diff --git a/mindnlp/__init__.py b/mindnlp/__init__.py index b87ab15d4..f44c5978b 100644 --- a/mindnlp/__init__.py +++ b/mindnlp/__init__.py @@ -17,6 +17,7 @@ MindNLP library. """ import os +import sys import platform from packaging import version @@ -46,8 +47,14 @@ if version.parse(mindspore.__version__) < version.parse('2.3.0'): mindspore.mint = None -from . import safetensors -from . import transformers -from . import evaluate +from . import integrations + +import transformers +import evaluate +import mindtorch + +sys.modules["mindnlp.transformers"] = transformers +sys.modules["mindnlp.evaluate"] = evaluate +sys.modules["mindnlp.core"] = mindtorch __all__ = ['transformers', 'evaluate', 'core'] \ No newline at end of file diff --git a/mindnlp/accelerate/__init__.py b/mindnlp/accelerate/__init__.py deleted file mode 100644 index 573430bcb..000000000 --- a/mindnlp/accelerate/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -"""accelerate""" -from .utils import ( - # AutocastKwargs, - # DataLoaderConfiguration, - # DDPCommunicationHookType, - # DeepSpeedPlugin, - # DistributedDataParallelKwargs, - # FullyShardedDataParallelPlugin, - accelerate_distributed_type, - DistributedType, - # GradScalerKwargs, - # InitProcessGroupKwargs, - # ProfileKwargs, - # find_executable_batch_size, - infer_auto_device_map, - # is_rich_available, - # load_checkpoint_in_model, - # synchronize_rng_states, -) - -from .big_modeling import ( - # cpu_offload, - # cpu_offload_with_hook, - # disk_offload, - # dispatch_model, - init_empty_weights, - init_on_empty, - # load_checkpoint_and_dispatch, -) diff --git a/mindnlp/accelerate/accelerator.py b/mindnlp/accelerate/accelerator.py deleted file mode 100644 index 8c31f3202..000000000 --- a/mindnlp/accelerate/accelerator.py +++ /dev/null @@ -1,244 +0,0 @@ -"""accelerate""" -import os -from contextlib import contextmanager -from typing import Optional - -import mindspore -from mindspore import nn -from mindspore.communication import init - -from .state import AcceleratorState -from .utils import ( - MindFormersPlugin, - is_mindformers_available, - wait_for_everyone -) -from .utils import DistributedType,accelerate_distributed_type -from ..utils import logging - -if is_mindformers_available(): - from .utils import ( - MindFormersEngine, - MindFormersOptimizerWrapper, - MindFormersSchedulerWrapper, - mindformers_initialize, - mindformers_prepare_data_loader, - mindformers_prepare_model_optimizer_scheduler - ) - -logger = logging.get_logger(__name__) - - -class Accelerator: - """ - Creates an instance of an accelerator for distributed training (on Ascend) - - Args: - mindformers_plugin (`MindFormersPlugin`, *optional*): - This argument is optional and can be configured directly using *accelerate config* - """ - - def __init__( - self, - mindformers_plugin: Optional[MindFormersPlugin] = None, - ): - # init mindformers_plugin from env variables - if mindformers_plugin is None: - mindformers_plugin = ( - MindFormersPlugin() if accelerate_distributed_type == DistributedType.MINDFORMERS else None - ) - else: - os.environ["ACCELERATE_USE_MINDFORMERS"] = "true" - self.state = AcceleratorState(mindformers_plugin=mindformers_plugin) - - if mindformers_plugin: - if not is_mindformers_available(): - raise ImportError("MindFormers is not installed. Please install it") - # The distributed backend required to initialize the communication service. - # Should be placed before Tensor and Parameter are created. 
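> Note: the rewritten `__init__.py` above exposes third-party packages under the `mindnlp.*` namespace by registering them in `sys.modules`. Below is a minimal, self-contained sketch of that aliasing pattern; `demo_pkg` is a hypothetical name and the standard-library `json` module stands in for `transformers`/`mindtorch`, so it runs without MindNLP installed.

```python
import sys
import types
import json  # stand-in for a real dependency such as transformers or mindtorch

# Build a package object and alias an existing module under it, mirroring
# `sys.modules["mindnlp.transformers"] = transformers` in the hunk above.
pkg = types.ModuleType("demo_pkg")
pkg.json = json
sys.modules["demo_pkg"] = pkg
sys.modules["demo_pkg.json"] = json

# Both import paths now resolve to the very same module object.
from demo_pkg import json as aliased_json
assert aliased_json is json
```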
- mindspore.set_context(mode=mindspore.PYNATIVE_MODE) - init() - - # Internal references to the training objects - self._optimizers = [] - self._models = [] - self._schedulers = [] - self._dataloaders = [] - self._custom_objects = [] - - @property - def use_distributed(self): - return self.state.use_distributed - - @property - def distributed_type(self): - return self.state.distributed_type - - @property - def num_processes(self): - return self.state.num_processes - - @property - def process_index(self): - return self.state.process_index - - @property - def is_main_process(self): - """True for one process only.""" - return self.state.is_main_process - - def prepare(self, *args): - """ - Prepare all objects passed in `args` for distributed training. Then return them in the same order. - Args: - *args (list of objects): - Any of the following type of objects: - - - `mindspore.dataset.GeneratorDataset`: MindSpore Dataloader - - `mindspore.nn.Cell`: MindSpore Module - - `mindspore.nn.optim.Optimizer`: MindSpore Optimizer - - `mindspore.nn.learning_rate_schedule.LearningRateSchedule`: MindSpore Scheduler - - Returns: Prepared objects in the same order. - - """ - result = [] - - # Only support mindsormers and MULTI_NPU now - if self.distributed_type == DistributedType.MINDFORMERS: - result = self._prepare_mindformers(*args) - elif self.distributed_type == DistributedType.MULTI_NPU: - pass # nothing prepare for data parallel - return result - - def _prepare_mindformers(self, *args): - mindformers_plugin = self.state.mindformers_plugin - - model = None - optimizer = None - scheduler = None - batch_data = None - for obj in args: - if isinstance(obj, mindspore.dataset.GeneratorDataset) and batch_data is None: - batch_data = obj - elif isinstance(obj, mindspore.nn.Cell): - model = obj - elif isinstance(obj, mindspore.nn.optim.Optimizer): - optimizer = obj - elif isinstance(obj, mindspore.nn.learning_rate_schedule.LearningRateSchedule): - scheduler = obj - - # Config is not correct now - if model is not None: - mindformers_plugin.set_model_args(model, batch_data) - if optimizer is not None: - mindformers_plugin.set_optimizer_args(optimizer) - if scheduler is not None: - mindformers_plugin.set_paralle_args(scheduler) - mindformers_plugin.set_training_args() - - # initialize mindformers - mindformers_initialize(self, args_defaults=mindformers_plugin.mindformers_defualt_args) - - (model, optimizer, scheduler) = mindformers_prepare_model_optimizer_scheduler(self) - self.wait_for_everyone() - - counter = 0 - result = [] - for obj in args: - if isinstance(obj, mindspore.dataset.GeneratorDataset): - data_loader = mindformers_prepare_data_loader(self, obj) - result.append(data_loader) - counter += 1 - else: - result.append(obj) - - if model is not None: - model = MindFormersEngine(self, model, optimizer) - if optimizer is not None: - optimizer = MindFormersOptimizerWrapper(optimizer) - if scheduler is not None: - scheduler = MindFormersSchedulerWrapper(scheduler, optimizer) - - for i in range(len(result)): - if isinstance(result[i], nn.Cell): - result[i] = model - elif isinstance(result[i], nn.Optimizer): - result[i] = optimizer - elif isinstance(result[i], nn.learning_rate_schedule.LearningRateSchedule): - result[i] = scheduler - - if model is not None: - self._models.append(model) - if len(self._models) > 1: - raise AssertionError( - "You can't use same `Accelerator()` instance with multiple models when using MindFormers." 
- ) - if optimizer is not None: - self._optimizers.append(optimizer) - - return tuple(result) - - def backward(self, loss, **kwargs): - pass - - @contextmanager - def main_process_first(self): - """ - Lets the main process go first inside a with block. - - The other processes will enter the with block after the main process exits. - - Example: - - ```python - >>> from accelerate import Accelerator - - >>> accelerator = Accelerator() - >>> with accelerator.main_process_first(): - ... # This will be printed first by process 0 then in a seemingly - ... # random order by the other processes. - ... print(f"This will be printed by process {accelerator.process_index}") - ``` - """ - with self.state.main_process_first(): - yield - - def wait_for_everyone(self): - """ - Will stop the execution of the current process until every other process has reached that point (so this does - nothing when the script is only run in one process). Useful to do before saving a model. - - Example: - - ```python - >>> # Assuming two GPU processes - >>> import time - >>> from accelerate import Accelerator - - >>> accelerator = Accelerator() - >>> if accelerator.is_main_process: - ... time.sleep(2) - >>> else: - ... print("I'm waiting for the main process to finish its sleep...") - >>> accelerator.wait_for_everyone() - >>> # Should print on every process at the same time - >>> print("Everyone is here") - ``` - """ - wait_for_everyone() - - def print(self, *args, **kwargs): - """ - Drop in replacement of `print()` to only print once per server. - - Example: - - ```python - >>> from accelerate import Accelerator - - >>> accelerator = Accelerator() - >>> accelerator.print("Hello world!") - ``` - """ - self.state.print(*args, **kwargs) diff --git a/mindnlp/accelerate/big_modeling.py b/mindnlp/accelerate/big_modeling.py deleted file mode 100644 index 4d89409c7..000000000 --- a/mindnlp/accelerate/big_modeling.py +++ /dev/null @@ -1,97 +0,0 @@ -"""big modeling""" -from contextlib import contextmanager -try: - from mindspore._c_expression import TensorPy as Tensor_ # pylint: disable=no-name-in-module -except: - from mindspore._c_expression import Tensor as Tensor_ # pylint: disable=no-name-in-module - -from mindnlp.utils.testing_utils import parse_flag_from_env -from mindnlp.core import nn - -@contextmanager -def init_empty_weights(include_buffers: bool = None): - """ - A context manager under which models are initialized with all parameters on the meta device, therefore creating an - empty model. Useful when just initializing the model would blow the available RAM. - - Args: - include_buffers (`bool`, *optional*): - Whether or not to also put all buffers on the meta device while initializing. - - Example: - - ```python - import torch.nn as nn - from accelerate import init_empty_weights - - # Initialize a model with 100 billions parameters in no time and without using any RAM. - with init_empty_weights(): - tst = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)]) - ``` - - - - Any model created under this context manager has no weights. As such you can't do something like - `model.to(some_device)` with it. To load weights inside your empty model, see [`load_checkpoint_and_dispatch`]. - Make sure to overwrite the default device_map param for [`load_checkpoint_and_dispatch`], otherwise dispatch is not - called. 
- - - """ - if include_buffers is None: - include_buffers = parse_flag_from_env("ACCELERATE_INIT_INCLUDE_BUFFERS", False) - with init_on_empty(include_buffers=include_buffers) as f: - yield f - - -@contextmanager -def init_on_empty(include_buffers: bool = None): - """ - A context manager under which models are initialized with all parameters on the specified device. - - Args: - device (`torch.device`): - Device to initialize all parameters on. - include_buffers (`bool`, *optional*): - Whether or not to also put all buffers on the meta device while initializing. - - Example: - - ```python - import torch.nn as nn - from accelerate import init_on_device - - with init_on_device(device=torch.device("cuda")): - tst = nn.Linear(100, 100) # on `cuda` device - ``` - """ - if include_buffers is None: - include_buffers = parse_flag_from_env("ACCELERATE_INIT_INCLUDE_BUFFERS", False) - - old_register_parameter = nn.Module.register_parameter - if include_buffers: - old_register_buffer = nn.Module.register_buffer - - def register_empty_parameter(module, name, param): - old_register_parameter(module, name, param) - if param is not None: - kwargs = module._parameters[name].__dict__ - kwargs["requires_grad"] = param.requires_grad - module._parameters[name].assign_value(Tensor_(shape=module._parameters[name].shape, dtype=module._parameters[name].dtype)) - module._parameters[name].meta = True - - def register_empty_buffer(module, name, buffer, persistent=True): - old_register_buffer(module, name, buffer, persistent=persistent) - if buffer is not None: - module._buffers[name].assign_value(Tensor_(shape=module._parameters[name].shape, dtype=module._buffers[name].dtype)) - module._buffers[name].meta = True - - try: - nn.Module.register_parameter = register_empty_parameter - if include_buffers: - nn.Module.register_buffer = register_empty_buffer - yield - finally: - nn.Module.register_parameter = old_register_parameter - if include_buffers: - nn.Module.register_buffer = old_register_buffer diff --git a/mindnlp/accelerate/data_loader.py b/mindnlp/accelerate/data_loader.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mindnlp/accelerate/inference.py b/mindnlp/accelerate/inference.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mindnlp/accelerate/optimizer.py b/mindnlp/accelerate/optimizer.py deleted file mode 100644 index 714695d3a..000000000 --- a/mindnlp/accelerate/optimizer.py +++ /dev/null @@ -1,22 +0,0 @@ -"""accelerate optimizer""" -from mindspore.nn.optim.optimizer import Optimizer -from .state import AcceleratorState - - -class AcceleratedOptimizer(Optimizer): - def __init__(self, optimizer): - super().__init__(learning_rate=optimizer.learning_rate, parameters=optimizer.parameters) - self.optimizer = optimizer - self.accelerator_state = AcceleratorState() - - def step(self): - """ - Performs a single optimization step. - """ - self.optimizer.step() - - def zero_grad(self): - """ - Clears the gradients of all optimized tensors. 
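> Note: the removed `init_on_empty` helper works by temporarily monkey-patching `nn.Module.register_parameter` (and optionally `register_buffer`) and restoring the originals in a `finally` block. The sketch below reproduces only that patch-and-restore shape with a toy class; `Registry` and `init_on_placeholder` are made-up names for illustration, not part of any accelerate API.

```python
from contextlib import contextmanager

class Registry:
    """Toy stand-in for nn.Module: it stores registered values eagerly."""
    def __init__(self):
        self.params = {}

    def register_parameter(self, name, value):
        self.params[name] = value

@contextmanager
def init_on_placeholder(cls):
    # Swap the registration hook, then restore it in `finally` -- the same
    # structure init_on_empty uses for register_parameter/register_buffer.
    old_register = cls.register_parameter

    def register_placeholder(self, name, value):
        old_register(self, name, f"<placeholder:{name}>")

    cls.register_parameter = register_placeholder
    try:
        yield
    finally:
        cls.register_parameter = old_register

with init_on_placeholder(Registry):
    r = Registry()
    r.register_parameter("weight", [0.0] * 1_000_000)

print(r.params["weight"])               # '<placeholder:weight>' -- nothing large was kept
Registry().register_parameter("w", 1)   # the original hook is restored after the block
```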
- """ - self.optimizer.zero_grad() diff --git a/mindnlp/accelerate/scheduler.py b/mindnlp/accelerate/scheduler.py deleted file mode 100644 index 944d55b07..000000000 --- a/mindnlp/accelerate/scheduler.py +++ /dev/null @@ -1,14 +0,0 @@ -"""accelerate scheduler.""" - - -class AcceleratedScheduler: - def __init__(self, scheduler, optimizers, step_with_optimizer: bool = True, split_batches: bool = False): - self.scheduler = scheduler - self.optimizers = optimizers if isinstance(optimizers, (list, tuple)) else [optimizers] - self.split_batches = split_batches - self.step_with_optimizer = step_with_optimizer - - def step(self): - """ - Performs a step of the scheduler. - """ diff --git a/mindnlp/accelerate/state.py b/mindnlp/accelerate/state.py deleted file mode 100644 index 4dc3b4b19..000000000 --- a/mindnlp/accelerate/state.py +++ /dev/null @@ -1,458 +0,0 @@ -"""accelerate state""" -from functools import partial -from contextlib import contextmanager -from typing import Callable, Any -from mindspore import communication - -try: - from mindspore.communication.comm_func import barrier -except: - barrier = None - -from .utils import ( - is_mindformers_available -) -from ..accelerate.utils import accelerate_distributed_type, DistributedType - -SharedDict = dict - - -# Lambda function that does nothing -def do_nothing(*args, **kwargs): - return None - - -class PartialState: - _shared_state = SharedDict() - _know_attrs = [ - "_cpu", - "_mixed_precision", - "_shared_state", - "backend", - "debug", - "device", - "distributed_type", - "fork_launched", - "local_process_index", - "num_processes", - "process_index", - ] - - def __init__(self, **kwargs): - self.__dict__ = self._shared_state - self._prepare_backend() - - if self.backend == "hccl": - self.num_processes = communication.get_group_size() - self.process_index = communication.get_rank() - - def __repr__(self) -> str: - return ( - f"Distributed environment: {self.distributed_type}{(' Backend: ' + self.backend) if self.backend else ''}\n" - f"Num processes: {self.num_processes}\n" - f"Process index: {self.process_index}\n" - ) - - @staticmethod - def _reset_state(): - """Resets `_shared_state`, is used internally and should not be called""" - PartialState._shared_state.clear() - - @property - def initialized(self) -> bool: - """Returns whether the `PartialState` has been initialized""" - return self._shared_state - - @property - def use_distributed(self): - """ - Whether the Accelerator is configured for distributed training - """ - return self.distributed_type != DistributedType.NO and self.num_processes > 1 - - @property - def is_last_process(self) -> bool: - """Returns whether the current process is the last one""" - return self.process_index == self.num_processes - 1 - - @property - def is_main_process(self) -> bool: - """Returns whether the current process is the main process""" - return ( - self.process_index == 0 if self.distributed_type != DistributedType.MINDFORMERS else self.is_last_process - ) - - @property - def num_processes(self): - """Returns num process""" - return self.num_processes - - @property - def process_index(self): - """Returns process index""" - return self.process_index - - @property - def is_local_main_process(self) -> bool: - """Returns whether the current process is the main process on the local node""" - return ( - self.local_process_index == 0 - if self.distributed_type != DistributedType.MINDFORMERS - else self.is_last_process - ) - - def wait_for_everyone(self): - """ - Will stop the execution of the current 
process until every other process has reached that point (so this does - nothing when the script is only run in one process). Useful to do before saving a model. - - Example: - - ```python - >>> # Assuming two GPU processes - >>> import time - >>> from accelerate.state import PartialState - - >>> state = PartialState() - >>> if state.is_main_process: - ... time.sleep(2) - >>> else: - ... print("I'm waiting for the main process to finish its sleep...") - >>> state.wait_for_everyone() - >>> # Should print on every process at the same time - >>> print("Everyone is here") - ``` - """ - if self.distributed_type in ( - DistributedType.MINDFORMERS, - ): - barrier() - - def _goes_first(self, is_main: bool): - if not is_main: - self.wait_for_everyone() - - yield - - if is_main: - self.wait_for_everyone() - - @contextmanager - def main_process_first(self): - """ - Lets the main process go first inside a with block. - - The other processes will enter the with block after the main process exits. - - Example: - - ```python - >>> from accelerate import Accelerator - - >>> accelerator = Accelerator() - >>> with accelerator.main_process_first(): - ... # This will be printed first by process 0 then in a seemingly - ... # random order by the other processes. - ... print(f"This will be printed by process {accelerator.process_index}") - ``` - """ - yield from self._goes_first(self.is_main_process) - - @contextmanager - def local_main_process_first(self): - """ - Lets the local main process go inside a with block. - - The other processes will enter the with block after the main process exits. - - Example: - - ```python - >>> from accelerate.state import PartialState - - >>> state = PartialState() - >>> with state.local_main_process_first(): - ... # This will be printed first by local process 0 then in a seemingly - ... # random order by the other processes. - ... print(f"This will be printed by process {state.local_process_index}") - ``` - """ - yield from self._goes_first(self.is_local_main_process) - - def on_main_process(self, function: Callable[..., Any] = None): - """ - Decorator that only runs the decorated function on the main process. - - Args: - function (`Callable`): The function to decorate. - - Example: - - ```python - >>> from accelerate.state import PartialState - - >>> state = PartialState() - - - >>> @state.on_main_process - ... def print_something(): - ... print("This will be printed by process 0 only.") - - - >>> print_something() - "This will be printed by process 0 only" - ``` - """ - if not self.initialized: - raise ValueError("The `PartialState` or `Accelerator` must be initialized before calling this function.") - if self.is_main_process or not self.use_distributed: - return function - return do_nothing - - def on_local_main_process(self, function: Callable[..., Any] = None): - """ - Decorator that only runs the decorated function on the local main process. - - Args: - function (`Callable`): The function to decorate. - - Example: - ```python - # Assume we have 2 servers with 4 processes each. 
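> Note: the removed `_goes_first`/`main_process_first` helpers order work around a distributed barrier: non-main workers wait before the block, the main worker waits after it. The sketch below shows that ordering with `threading.Barrier` standing in for the collective `barrier()`; it is an illustration of the pattern, not the actual distributed implementation.

```python
import threading
from contextlib import contextmanager

barrier = threading.Barrier(2)  # stand-in for the distributed barrier()

@contextmanager
def goes_first(is_main: bool):
    # Same ordering trick as PartialState._goes_first: non-main workers
    # block *before* the body, the main worker blocks *after* it.
    if not is_main:
        barrier.wait()
    yield
    if is_main:
        barrier.wait()

def worker(rank: int):
    with goes_first(rank == 0):
        print(f"rank {rank} runs the guarded block")

threads = [threading.Thread(target=worker, args=(r,)) for r in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()
# rank 0 always prints before rank 1
```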
- from accelerate.state import PartialState - - state = PartialState() - - - @state.on_local_main_process - def print_something(): - print("This will be printed by process 0 only on each server.") - - - print_something() - # On server 1: - "This will be printed by process 0 only" - # On server 2: - "This will be printed by process 0 only" - ``` - """ - if self.is_local_main_process or not self.use_distributed: - return function - return do_nothing - - def on_last_process(self, function: Callable[..., Any]): - """ - Decorator that only runs the decorated function on the last process. - - Args: - function (`Callable`): The function to decorate. - - Example: - ```python - # Assume we have 4 processes. - from accelerate.state import PartialState - - state = PartialState() - - - @state.on_last_process - def print_something(): - print(f"Printed on process {state.process_index}") - - - print_something() - "Printed on process 3" - ``` - """ - if self.is_last_process or not self.use_distributed: - return function - return do_nothing - - def on_process(self, function: Callable[..., Any] = None, process_index: int = None): - """ - Decorator that only runs the decorated function on the process with the given index. - - Args: - function (`Callable`, `optional`): - The function to decorate. - process_index (`int`, `optional`): - The index of the process on which to run the function. - - Example: - ```python - # Assume we have 4 processes. - from accelerate.state import PartialState - - state = PartialState() - - - @state.on_process(process_index=2) - def print_something(): - print(f"Printed on process {state.process_index}") - - - print_something() - "Printed on process 2" - ``` - """ - if function is None: - return partial(self.on_process, process_index=process_index) - if (self.process_index == process_index) or (not self.use_distributed): - return function - return do_nothing - - def on_local_process(self, function: Callable[..., Any] = None, local_process_index: int = None): - """ - Decorator that only runs the decorated function on the process with the given index on the current node. - - Args: - function (`Callable`, *optional*): - The function to decorate. - local_process_index (`int`, *optional*): - The index of the local process on which to run the function. - - Example: - ```python - # Assume we have 2 servers with 4 processes each. 
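> Note: the `on_main_process`/`on_process`/`on_local_process` decorators above all reduce to the same selection rule: return the function unchanged on the chosen rank, otherwise return a no-op. A compact sketch of that rule follows; `on_rank` is a hypothetical name used only for illustration.

```python
def do_nothing(*args, **kwargs):
    return None

def on_rank(target_rank: int, current_rank: int):
    # Same selection logic as on_process/on_local_process: hand back the
    # function unchanged on the chosen rank, a silent no-op elsewhere.
    def decorator(fn):
        return fn if current_rank == target_rank else do_nothing
    return decorator

@on_rank(target_rank=0, current_rank=0)
def announce():
    print("printed on rank 0 only")

announce()            # prints

@on_rank(target_rank=0, current_rank=3)
def announce_again():
    print("never printed on rank 3")

announce_again()      # silently does nothing
```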
- from accelerate import Accelerator - - accelerator = Accelerator() - - - @accelerator.on_local_process(local_process_index=2) - def print_something(): - print(f"Printed on process {accelerator.local_process_index}") - - - print_something() - # On server 1: - "Printed on process 2" - # On server 2: - "Printed on process 2" - ``` - """ - if function is None: - return partial(self.on_local_process, local_process_index=local_process_index) - if (self.local_process_index == local_process_index) or (not self.use_distributed): - return function - return do_nothing - - def print(self, *args, **kwargs): - if self.is_local_main_process: - print(*args, **kwargs) - - def _prepare_backend(self): - # now mindformers and mindspore data parallel only - if accelerate_distributed_type == DistributedType.MINDFORMERS and is_mindformers_available(): - self.backend = "hccl" - self.distributed_type = DistributedType.MINDFORMERS - elif accelerate_distributed_type == DistributedType.MULTI_NPU: - self.backend = "hccl" - self.distributed_type = DistributedType.MULTI_NPU - - @num_processes.setter - def num_processes(self, value): - self._num_processes = value - - @process_index.setter - def process_index(self, value): - self._process_index = value - - -class AcceleratorState: - _shared_state = SharedDict() - _know_attrs = PartialState._know_attrs + [ - "mindformers_plugin" - ] - - def __init__(self, mindformers_plugin=None, **kwargs): - self.__dict__ = self._shared_state - if PartialState._shared_state: - PartialState(**kwargs) - self.__dict__.update(PartialState._shared_state) - # set distributed_type - if accelerate_distributed_type == DistributedType.MULTI_NPU: - self.distributed_type = DistributedType.MULTI_NPU - elif accelerate_distributed_type == DistributedType.MINDFORMERS: - self.distributed_type = DistributedType.MINDFORMERS - self.mindformers_plugin = mindformers_plugin - else: - self.distributed_type = DistributedType.NO - - PartialState._shared_state["distributed_type"] = self.distributed_type - - def __repr__(self): - return PartialState().__repr__() - - @property - def initialized(self) -> bool: - return self._shared_state != PartialState._shared_state - - @staticmethod - def _reset_state(reset_partial_state: bool = False): - """Resets `_shared_state`, is used internally and should not be called""" - AcceleratorState._shared_state.clear() - if reset_partial_state: - PartialState._reset_state() - - @property - def use_distributed(self): - """ - Whether the Accelerator is configured for distributed training - """ - return PartialState().use_distributed - - @property - def is_last_process(self) -> bool: - """Returns whether the current process is the last one""" - return PartialState().is_last_process - - @property - def is_main_process(self) -> bool: - """Returns whether the current process is the main process""" - return PartialState().is_main_process - - @property - def is_local_main_process(self) -> bool: - """Returns whether the current process is the main process on the local node""" - return PartialState().is_local_main_process - - @property - def num_processes(self): - """Returns num process""" - return PartialState().num_processes - - @property - def process_index(self): - """Returns process index""" - return PartialState().process_index - - def wait_for_everyone(self): - """ - Will stop the execution of the current process until every other process has reached that point (so this does - nothing when the script is only run in one process). Useful to do before saving a model. 
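> Note: `PartialState` and `AcceleratorState` share configuration across instances by rebinding each instance's `__dict__` to a class-level `_shared_state` dict (the classic Borg pattern). A minimal sketch of that mechanism, under hypothetical names:

```python
class SharedState:
    # Borg-style shared state: every instance rebinds its __dict__ to one
    # class-level dict, which is how PartialState/AcceleratorState keep a
    # single process-wide view via `_shared_state`.
    _shared_state = {}

    def __init__(self):
        self.__dict__ = self._shared_state

a = SharedState()
a.process_index = 0
b = SharedState()
print(b.process_index)   # 0 -- attributes set on one instance appear on all
```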
- """ - PartialState().wait_for_everyone() - - @contextmanager - def main_process_first(self): - """ - Lets the main process go first inside a with block. - - The other processes will enter the with block after the main process exits. - """ - with PartialState().main_process_first(): - yield - - @contextmanager - def local_main_process_first(self): - """ - Lets the local main process go inside a with block. - - The other processes will enter the with block after the main process exits. - """ - with PartialState().local_main_process_first(): - yield - - def print(self, *args, **kwargs): - PartialState().print(*args, **kwargs) diff --git a/mindnlp/accelerate/utils/__init__.py b/mindnlp/accelerate/utils/__init__.py deleted file mode 100644 index 98bdbb08c..000000000 --- a/mindnlp/accelerate/utils/__init__.py +++ /dev/null @@ -1,57 +0,0 @@ -"""accelerate utils""" -from .constants import accelerate_distributed_type -from .dataclasses import ( - DistributedType, - MindFormersPlugin -) -from .environment import ( - str_to_bool -) -from .imports import ( - is_mindformers_available -) -from .modeling import ( - # calculate_maximum_sizes, - # check_device_map, - check_tied_parameters_in_config, - check_tied_parameters_on_same_device, - compute_module_sizes, - convert_file_size_to_int, - dtype_byte_size, - find_tied_parameters, - get_balanced_memory, - get_max_layer_size, - get_max_memory, - # get_mixed_precision_context_manager, - # id_tensor_storage, - infer_auto_device_map, - # is_peft_model, - # load_checkpoint_in_model, - # load_offloaded_weights, - # load_state_dict, - named_module_tensors, - modify_model_for_pp_infer, - find_usefull_files, - # retie_parameters, - # set_module_tensor_to_device, - # shard_checkpoint, -) - -from .other import ( - wait_for_everyone -) - -from .mindformers import ( - MindFormersDummyDataLoader, - MindFormersDummyScheduler -) - -if is_mindformers_available(): - from .mindformers import ( - MindFormersEngine, - MindFormersOptimizerWrapper, - MindFormersSchedulerWrapper, - initialize as mindformers_initialize, - prepare_data_loader as mindformers_prepare_data_loader, - prepare_model_optimizer_scheduler as mindformers_prepare_model_optimizer_scheduler - ) diff --git a/mindnlp/accelerate/utils/config.py b/mindnlp/accelerate/utils/config.py deleted file mode 100644 index 9bdd7eb1c..000000000 --- a/mindnlp/accelerate/utils/config.py +++ /dev/null @@ -1,112 +0,0 @@ -"""acclerate config""" -from typing import Union, Optional -from dataclasses import dataclass - - -@dataclass -class MindformersTrainningConfig: - seed: int = None - output_dir: str = "./output" - training_iters: int = 1 - epochs: int = None - log_interval: int = None - eval_interval: int = None - save_interval: int = None - best_metric_comparison: str = None - eval_metric: str = None - grad_clip_kwargs: dict = None - loss_scale: Union[float, int] = None - loss_scale_value: Union[float, int] = None - loss_scale_factor: int = None - loss_scale_window: int = None - loss_reduction: str = "mean" - calculate_per_token_loss: bool = False - wrap_with_ddp: bool = False - overlap_grad_reduce: bool = False - use_distributed_optimizer: bool = False - bucket_size: Optional[int] = None - check_for_nan_in_grad: bool = False - - -@dataclass -class MindForemrsOptimizerConfig: - optimizer_type: str = "AdamWeightDecay" - learning_rate: float = 1e-3 - learning_rate_scheduler_kwargs: dict = None - weight_decay: float = 0.0 - weight_decay_kwargs: dict = None - zero_config: dict = None - - -@dataclass -class 
MindFormersModelParallelConfig: - tensor_parallel: int = 1 - pipeline_stage: int = 1 - context_parallel: int = 1 - expert_parallel: int = 1 - virtual_pipeline_model_parallel_size: int = None - micro_batch_num: int = 1 - use_sequence_parallel: bool = False - recv_dtype: str = "float32" - zero_level: bool = None - gradient_accumulation_fusion: bool = False - standalone_embedding_stage: bool = False - overlap_p2p_comm: bool = False - - -@dataclass -class MindFormersDatasetConfig: - dataset_dir: str = "./dataset" - shuffle: bool = False - batch_size: int = 1 - micro_batch_num: int = 1 - - -@dataclass -class MindFormersTransformerConfig: - vocab_size: int - num_layers: int - num_heads: int - hidden_size: int - ffn_hidden_size: int - seq_length: int = None - attention_type: str = "self_attn" - position_embedding_type: str = 'absolute' - parallel_position_embedding: bool = False - rotary_config: dict = None - use_query_layer: bool = False - use_visual_encoder: bool = False - use_retriever: bool = False - use_gqa: bool = False - kv_num_heads: int = 32 - qkv_has_bias: bool = True - out_proj_has_bias: bool = True - apply_query_key_layer_scaling: bool = False - use_flash_attention: bool = False - fa_config = None - mask_func_type: str = "attn_mask_add" - mlp_has_bias: bool = True - mlp_has_gate: bool = False - hidden_act: str = "gelu" - normalization: str = "LayerNorm" - layernorm_epsilon: float = 1.0e-5 - apply_residual_connection_post_norm: bool = False - use_final_norm: bool = True - residual_connection_dtype: str = "float32" - init_method_std: float = 0.01 - param_init_dtype: str = "float32" - embedding_init_dtype: str = "float32" - compute_dtype: str = "float16" - softmax_compute_dtype: str = "float32" - init_method: str = 'normal' - bias_init: str = 'zeros' - fp16_lm_cross_entropy: bool = False - hidden_dropout_rate: float = 0.0 - attention_dropout_rate: float = 0.0 - out_hidden_size: int = None - num_experts: int = None - untie_embeddings_and_output_weights: bool = False - flatten_labels_and_input_mask: bool = True - recompute_method: str = None - recompute_num_layers: int = None - recompute_granularity: str = None diff --git a/mindnlp/accelerate/utils/constants.py b/mindnlp/accelerate/utils/constants.py deleted file mode 100644 index 7fb87167f..000000000 --- a/mindnlp/accelerate/utils/constants.py +++ /dev/null @@ -1,34 +0,0 @@ -"""constants""" -import os -import mindspore -import numpy -from .dataclasses import DistributedType - - -_random_seed = numpy.random.randint(1000) - - -def _prepare_data_parallel_native_minspore(): - # initialize data parallel hcc backend for data_loader and Trainer API - mindspore.set_auto_parallel_context(parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True) - mindspore.communication.init() - mindspore.set_seed(_random_seed) - - -def detect_accelerate_distributed_type(): - """ - detect distributed_type - - Returns: - _type_: According to the factors such as the available parallel software and hardware environment of the current system and the user-specified parallel scheme, - the optimal parallel strategy is comprehensively decided in different situations. 
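> Note: the configuration classes above are plain dataclasses that the plugin later flattens with `dataclasses.asdict` when assembling its default-args dict. A pared-down sketch of that construction, assuming a hypothetical `ParallelConfig` in place of `MindFormersModelParallelConfig`:

```python
from dataclasses import dataclass, asdict
from typing import Optional

@dataclass
class ParallelConfig:
    # Reduced stand-in for MindFormersModelParallelConfig.
    tensor_parallel: int = 1
    pipeline_stage: int = 1
    virtual_pipeline_model_parallel_size: Optional[int] = None

# The plugin code builds its config dict the same way: instantiate the
# dataclass, then flatten it with asdict().
defaults = asdict(ParallelConfig(tensor_parallel=2))
print(defaults)
# {'tensor_parallel': 2, 'pipeline_stage': 1, 'virtual_pipeline_model_parallel_size': None}
```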
- """ - if os.environ.get("MULTI_NPU", None) == "true": - _prepare_data_parallel_native_minspore() - return DistributedType.MULTI_NPU - if os.environ.get("ACCELERATE_USE_MINDFORMERS", "false") == "true": - return DistributedType.MINDFORMERS - else: - return DistributedType.NO - -accelerate_distributed_type = detect_accelerate_distributed_type() diff --git a/mindnlp/accelerate/utils/dataclasses.py b/mindnlp/accelerate/utils/dataclasses.py deleted file mode 100644 index 1371f9659..000000000 --- a/mindnlp/accelerate/utils/dataclasses.py +++ /dev/null @@ -1,94 +0,0 @@ -"""accelerate dataclasses""" -import enum -import functools -from dataclasses import dataclass, asdict - -from mindnlp.accelerate.utils.config import ( - MindformersTrainningConfig, - MindFormersModelParallelConfig, - MindForemrsOptimizerConfig, - MindFormersTransformerConfig -) - - -class DistributedType(str, enum.Enum): - """ - Represents a type of distributed environment. - - Values: - - **MINDFORMERS** -- Using mindformers - - **NO** -- Not a distributed environment, just a single process. - - **MULTI_NPU** -- Distributed data parallel on multiple NPUs. - """ - - MULTI_NPU = "MULTI_NPU" - MINDFORMERS = "MINDFORMERS" - NO = "NO" - - -@dataclass -class MindFormersPlugin: - """ - Plugin for MindFormersLM to enable tensor, pipeline, sequence and data parallelism. - """ - - def __post_init__(self): - self.mindformers_default_args = { - "trainning_config": {}, - "parallel_config": {}, - "model_config": {}, - "dataset_config": {}, - "optimizer_config": {} - } - - def set_trainning_args(self): - trainning_config = MindformersTrainningConfig() - self.mindformers_default_args["trainning_config"] = asdict(trainning_config) - - def set_optimizer_args(self): - optimizer_config = MindForemrsOptimizerConfig() - self.mindformers_default_args["optimizer_config"] = asdict(optimizer_config) - - def set_paralle_args(self): - parallel_config = MindFormersModelParallelConfig() - self.mindformers_default_args["parallel_config"] = asdict(parallel_config) - - def set_model_args(self, model, batch_data): - model_config_type = model.config.model_type.lower() - MODEL_CONFIGS_TO_MINDFORMERS_PARSERS[model_config_type](self, model, batch_data) - - @property - def config_dict(self): - return self.mindformers_default_args - - @property - def model_type(self): - model_type = "llama" - return model_type - - -MODEL_CONFIGS_TO_MINDFORMERS_PARSERS = {} - - -def add_model_config_to_mindformers_parser(model_type: str): - def add_model_config_parser_helper(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - return func(*args, **kwargs) - - MODEL_CONFIGS_TO_MINDFORMERS_PARSERS[model_type] = func - return wrapper - - return add_model_config_parser_helper - - -@add_model_config_to_mindformers_parser("llama") -def parse_llama_config(mindformers_plugin, model, batch_data): - model_config = MindFormersTransformerConfig( - vocab_size=1200, - hidden_size=128, - ffn_hidden_size=512, - num_layers=2, - num_heads=8, - ) - mindformers_plugin.mindformers_default_args["model_config"] = asdict(model_config) diff --git a/mindnlp/accelerate/utils/environment.py b/mindnlp/accelerate/utils/environment.py deleted file mode 100644 index 3a404f5d2..000000000 --- a/mindnlp/accelerate/utils/environment.py +++ /dev/null @@ -1,16 +0,0 @@ -"""accelerate environment utilities.""" - - -def str_to_bool(value) -> int: - """ - Converts a string representation of truth to `True` (1) or `False` (0). 
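> Note: `detect_accelerate_distributed_type` above chooses a backend from environment variables, with `MULTI_NPU` taking precedence over `ACCELERATE_USE_MINDFORMERS` and `NO` as the default. The sketch below reproduces only that precedence (it omits the HCCL/data-parallel initialization side effects); `detect` is a hypothetical name.

```python
import os
from enum import Enum

class DistributedType(str, Enum):
    MULTI_NPU = "MULTI_NPU"
    MINDFORMERS = "MINDFORMERS"
    NO = "NO"

def detect(env=None):
    # Mirrors the precedence in detect_accelerate_distributed_type:
    # MULTI_NPU wins, then ACCELERATE_USE_MINDFORMERS, default NO.
    env = os.environ if env is None else env
    if env.get("MULTI_NPU") == "true":
        return DistributedType.MULTI_NPU
    if env.get("ACCELERATE_USE_MINDFORMERS", "false") == "true":
        return DistributedType.MINDFORMERS
    return DistributedType.NO

print(detect({"ACCELERATE_USE_MINDFORMERS": "true"}))   # DistributedType.MINDFORMERS
print(detect({}))                                       # DistributedType.NO
```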
- - True values are `y`, `yes`, `t`, `true`, `on`, and `1`; False value are `n`, `no`, `f`, `false`, `off`, and `0`; - """ - value = value.lower() - if value in ("y", "yes", "t", "true", "on", "1"): - return 1 - elif value in ("n", "no", "f", "false", "off", "0"): - return 0 - else: - raise ValueError(f"invalid truth value {value}") diff --git a/mindnlp/accelerate/utils/imports.py b/mindnlp/accelerate/utils/imports.py deleted file mode 100644 index 36d90ce1d..000000000 --- a/mindnlp/accelerate/utils/imports.py +++ /dev/null @@ -1,13 +0,0 @@ -"""accelerate import utilities.""" -from mindnlp.utils.import_utils import _is_package_available - - -def is_mindformers_available(): - """ - Checks if the MindFormers library is available in the current environment. - - Returns: - bool: True if MindFormers library is available, False otherwise. - """ - _mindformers_available = _is_package_available("mindformers") - return _mindformers_available diff --git a/mindnlp/accelerate/utils/mindformers.py b/mindnlp/accelerate/utils/mindformers.py deleted file mode 100644 index 0a3c2c52c..000000000 --- a/mindnlp/accelerate/utils/mindformers.py +++ /dev/null @@ -1,202 +0,0 @@ -"""accelerate mindformers core.""" -import functools - -from mindspore import nn, Tensor - -from ..optimizer import AcceleratedOptimizer -from ..scheduler import AcceleratedScheduler -from .imports import is_mindformers_available -from ...utils import logging - - -logger = logging.get_logger(__name__) - -if is_mindformers_available(): - try: - from mindformers.experimental.model import LlamaForCausalLM # pylint: disable=import-error - from mindformers.experimental.parallel_core.pynative.config import init_configs_from_dict # pylint: disable=import-error - from mindformers.experimental.parallel_core.pynative.training import get_model, TrainOneStepCell # pylint: disable=import-error - from mindformers.experimental.parallel_core.pynative.parallel_state import initialize_model_parallel # pylint: disable=import-error - from mindformers.experimental.parallel_core.pynative import get_optimizer # pylint: disable=import-error - except Exception as e: - logger.warning('cannot found `mindformers.experimental`, please install dev version by\n' - '`pip install git+https://gitee.com/mindspore/mindformers` \n' - 'or remove mindformers by \n' - '`pip uninstall mindformers`') - -_GLOBAL_CONFIG_DICT: dict - - -def prepare_model_optimizer_scheduler(accelerator): - """ - Prepare mindformers model and optimizer - - Args: - accelerator: accelerator - - Returns: model, optimizer - - """ - accelerator.print("Preparing model, optimizer...") - - # load mindformers config - _CONFIG_DICT = accelerator.state.mindformers_plugin.config_dict - all_config = init_configs_from_dict(_CONFIG_DICT) - model_config = all_config.model_config - parallel_config = all_config.parallel_config - optimizer_config = all_config.optimizer_config - - # get model and optimizer - model_type = accelerator.state.mindformers_plugin.model_type - model_provider_func = MODEL_PROVIDER_FUNC[model_type](model_config, True, True) - model = get_model(model_provider_func, parallel_config) - optimizer = get_optimizer(optimizer_config, model.trainable_params(), model) - scheduler = None - - return model, optimizer, scheduler - - -def prepare_data_loader(accelerator, dataloader): - """ - Prepare dataloader in mindformers - - Args: - accelerator: accelerator - dataloader: original dataloader - - Returns: dataloader - - """ - accelerator.print("Preparing data loader...") - - all_config = 
init_configs_from_dict(_GLOBAL_CONFIG_DICT) - dataset_config = all_config.dataset_config - - # calculate global batch size - global_batch_size = dataset_config.batch_size * dataset_config.micro_batch_num - batch_dataloader = dataloader.batch(global_batch_size) - - return batch_dataloader - - -# optimizer utilities -class MindFormersOptimizerWrapper(AcceleratedOptimizer): - # - # def __init__(self, optimizer): - # super().__init__(optimizer) - - def zero_grad(self, set_to_none=None): - pass # `model(**batch)` is doing that automatically. Therefore, it's implementation is not needed - - def step(self): - pass # `model(**batch)` is doing that automatically. Therefore, it's implementation is not needed - - -class MindFormersSchedulerWrapper(AcceleratedScheduler): - - # def __init__(self, scheduler, optimizers): - # super().__init__(scheduler, optimizers) - - def step(self, *args, **kwargs): - return # `model(**batch)` is doing that automatically. Therefore, it's implementation is not needed - - -class MindFormersDummyDataLoader: - ... - - -class MindFormersDummyScheduler: - ... - - -def initialize(accelerator, args_defaults=None): - """ - Intialize mindformers setup - - Args: - accelerator: accelerator - args_defaults: args mindformers needed - - """ - if args_defaults is None: - args_defaults = {} - accelerator.print("Initializing MindFormers...") - - global _GLOBAL_CONFIG_DICT - if _GLOBAL_CONFIG_DICT is None: - _GLOBAL_CONFIG_DICT = args_defaults - - all_config = init_configs_from_dict(_GLOBAL_CONFIG_DICT) - parallel_config = all_config.parallel_config - - initialize_model_parallel( - tensor_model_parallel_size=parallel_config.tensor_parallel, - pipeline_model_parallel_size=parallel_config.pipeline_stage, - virtual_pipeline_model_parallel_size=parallel_config.virtual_pipeline_model_parallel_size, - context_parallel=parallel_config.context_parallel, - expert_model_parallel_size=parallel_config.expert_parallel - ) - - -class MindFormersEngine(nn.Cell): - """ - MindFormers model wrapper - - Args: - accelerator (:class:`~accelerate.Accelerator`): The accelerator object to use. 
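> Note: `prepare_data_loader` above re-batches the dataset with a global batch size of `batch_size * micro_batch_num`. The sketch below performs the same arithmetic over a plain iterable; `rebatch` is a hypothetical helper, not the MindSpore dataset API.

```python
from itertools import islice

def rebatch(samples, batch_size, micro_batch_num):
    # prepare_data_loader re-batches with batch_size * micro_batch_num;
    # this does the same grouping over an ordinary iterable.
    global_batch = batch_size * micro_batch_num
    it = iter(samples)
    while batch := list(islice(it, global_batch)):
        yield batch

print(list(rebatch(range(10), batch_size=2, micro_batch_num=2)))
# [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
```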
- model: MindFormers model - optimizer: MindFormers optimizer - """ - - def __init__(self, accelerator, model, optimizer): - super().__init__() - self.moddel = model - self.optimizer = optimizer - - _CONFIG_DICT = accelerator.state.mindformers_plugin.config_dict - all_config = init_configs_from_dict(_CONFIG_DICT) - training_config = all_config.training_config - model_config = all_config.model_config - - self.train_one_step = TrainOneStepCell(model, optimizer, training_config, model_config) - - def construct(self, tuple_data): - if self.model.training: - self.train_one_step.set_train(True) - set_input_data = [ - Tensor(shape=(None,) * len(input_data.shape), dtype=input_data.dtype) for input_data in tuple_data - ] - self.train_one_step_cell.set_inputs(*set_input_data) - self.train_one_step.set_inputs() - loss, is_finite, loss_scale, learning_rate = self.train_one_step_cell(**tuple_data) - return loss - else: - self.train_one_step.set_train(False) - self.train_one_step.forward_backward_func(forward_only=True, **tuple_data) - - self.train_one_step.set_train(False) - - -MODEL_PROVIDER_FUNC = {} - - -def add_model_provider_func(model_type: str): - def add_model_provier_func_parser_helper(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - return func(*args, **kwargs) - - MODEL_PROVIDER_FUNC[model_type] = func - return wrapper - - return add_model_provier_func_parser_helper - - -@add_model_provider_func("llama") -def provider_llama(config, pre_process=True, post_process=True): - # load model config, then create model in mindformers - def model_provider(inner_pre_process=pre_process, inner_post_process=post_process): - model = LlamaForCausalLM(config=config, pre_process=pre_process, post_process=post_process) - return model - - return model_provider diff --git a/mindnlp/accelerate/utils/modeling.py b/mindnlp/accelerate/utils/modeling.py deleted file mode 100644 index 8c132698c..000000000 --- a/mindnlp/accelerate/utils/modeling.py +++ /dev/null @@ -1,1174 +0,0 @@ -"""modeling utils for parallel inference""" -# pylint: disable=unnecessary-comprehension -import os -import re -import inspect -import warnings -import types -from subprocess import Popen, PIPE -from collections import OrderedDict, defaultdict -from typing import Optional, Dict, Union, List, Tuple, Set -import mindspore -from mindspore.communication import get_group_size, get_rank -from mindnlp.configs import SUPPORT_ASYNC_DIST_OP -try: - if SUPPORT_ASYNC_DIST_OP: - from mindspore.communication.comm_func import send as isend, recv as irecv, broadcast - else: - from mindspore.communication.comm_func import isend, irecv, broadcast -except: - from mindnlp.parallel.comm_func import isend, irecv, broadcast - -from ...core import nn, ops -from ...utils import logging - -logger = logging.get_logger(__name__) - - -def get_gpus_free_memory(): - nvidia_smi = "nvidia-smi" - - # Get ID, processing and memory utilization for all GPUs - try: - p = Popen([nvidia_smi,"--query-gpu=index,memory.free", "--format=csv,noheader,nounits"], stdout=PIPE) - stdout, stderror = p.communicate() - except: - return [] - output = stdout.decode('UTF-8') - - lines = output.split(os.linesep) - - numDevices = len(lines)-1 - GPUs = [] - for g in range(numDevices): - line = lines[g] - - vals = line.split(', ') - - for i in range(2): - if (i == 0): - deviceIds = int(vals[i]) - elif (i == 1): - memFree = int(vals[i]) - - GPUs.append((deviceIds, memFree)) - return GPUs - -def get_npus_free_memory(): - try: - p = Popen(['npu-smi',"info"], stdout=PIPE) - stdout, 
stderror = p.communicate() - except: - return [] - output = stdout.decode('UTF-8') - lines = output.split(os.linesep) - table_data = [] - i = 6 - while True: - if lines[i] == "+---------------------------+---------------+----------------------------------------------------+": - break - row1 = lines[i].split() - row1 = [i for i in row1 if i not in ('|', '/')] - row2 = lines[i + 1].split() - row2 = [i for i in row2 if i not in ('|', '/')] - table_data.append((row1[0], (int(row2[-1]) - int(row2[-2].replace('/', '')) // 1000))) - i += 3 - - return table_data - - -def dtype_byte_size(dtype): - """ - Returns the size (in bytes) occupied by one parameter of type `dtype`. - - Example: - - ```py - >>> dtype_byte_size(mindspore.float32) - 4 - ``` - """ - if dtype == mindspore.bool_: - return 1 / 8 - # elif dtype == CustomDtype.INT2: - # return 1 / 4 - # elif dtype == CustomDtype.INT4: - # return 1 / 2 - # elif dtype == CustomDtype.FP8: - # return 1 - bit_search = re.search(r"[^\d](\d+)$", str(dtype)) - if bit_search is None: - raise ValueError(f"`dtype` is not a valid dtype: {dtype}.") - bit_size = int(bit_search.groups()[0]) - return bit_size // 8 - - -def convert_file_size_to_int(size: Union[int, str]): - """ - Converts a size expressed as a string with digits an unit (like `"5MB"`) to an integer (in bytes). - - Args: - size (`int` or `str`): The size to convert. Will be directly returned if an `int`. - - Example: - - ```py - >>> convert_file_size_to_int("1MiB") - 1048576 - ``` - """ - mem_size = -1 - err_msg = ( - f"`size` {size} is not in a valid format. Use an integer for bytes, or a string with an unit (like '5.0GB')." - ) - try: - if isinstance(size, int): - mem_size = size - elif size.upper().endswith("GIB"): - mem_size = int(float(size[:-3]) * (2**30)) - elif size.upper().endswith("MIB"): - mem_size = int(float(size[:-3]) * (2**20)) - elif size.upper().endswith("KIB"): - mem_size = int(float(size[:-3]) * (2**10)) - elif size.upper().endswith("GB"): - int_size = int(float(size[:-2]) * (10**9)) - mem_size = int_size // 8 if size.endswith("b") else int_size - elif size.upper().endswith("MB"): - int_size = int(float(size[:-2]) * (10**6)) - mem_size = int_size // 8 if size.endswith("b") else int_size - elif size.upper().endswith("KB"): - int_size = int(float(size[:-2]) * (10**3)) - mem_size = int_size // 8 if size.endswith("b") else int_size - except ValueError: - raise ValueError(err_msg) - - if mem_size < 0: - raise ValueError(err_msg) - return mem_size - - -def get_module_leaves(module_sizes): - module_children = {} - for module in module_sizes: - if module == "" or "." not in module: - continue - parent = module.rsplit(".", 1)[0] - module_children[parent] = module_children.get(parent, 0) + 1 - leaves = [module for module in module_sizes if module_children.get(module, 0) == 0 and module != ""] - return leaves - -def get_balanced_memory( - model: nn.Module, - max_memory: Optional[Dict[Union[int, str], Union[int, str]]] = None, - no_split_module_classes: Optional[List[str]] = None, - dtype: Optional[Union[str, mindspore.dtype.TensorType]] = None, - special_dtypes: Optional[Dict[str, str]] = None, - low_zero: bool = False, -): - """ - Compute a `max_memory` dictionary for [`infer_auto_device_map`] that will balance the use of each available GPU. - - - - All computation is done analyzing sizes and dtypes of the model parameters. As a result, the model can be on the - meta device (as it would if initialized within the `init_empty_weights` context manager). 
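> Note: `dtype_byte_size` above extracts the bit width from a dtype's string form with a trailing-digits regex and divides by eight. A self-contained sketch of just that parsing step, under the hypothetical name `byte_size`:

```python
import re

def byte_size(dtype_name: str) -> float:
    # Same trailing-digits trick dtype_byte_size uses to read the bit width
    # out of a dtype's string form (e.g. "Float32" -> 4 bytes).
    match = re.search(r"[^\d](\d+)$", dtype_name)
    if match is None:
        raise ValueError(f"not a sized dtype: {dtype_name}")
    return int(match.group(1)) / 8

assert byte_size("Float32") == 4
assert byte_size("BFloat16") == 2
assert byte_size("Int8") == 1
```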
- - - - Args: - model (`nn.Module`): - The model to analyze. - max_memory (`Dict`, *optional*): - A dictionary device identifier to maximum memory. Will default to the maximum memory available if unset. - Example: `max_memory={0: "1GB"}`. - no_split_module_classes (`List[str]`, *optional*): - A list of layer class names that should never be split across device (for instance any layer that has a - residual connection). - dtype (`str` or `mindspore.dtype.TensorType`, *optional*): - If provided, the weights will be converted to that type when loaded. - special_dtypes (`Dict[str, str]`, *optional*): - If provided, special dtypes to consider for some specific weights (will override dtype used as default for - all weights). - low_zero (`bool`, *optional*): - Minimizes the number of weights on GPU 0, which is convenient when it's used for other operations (like the - Transformers generate function). - """ - # Get default / clean up max_memory - user_not_set_max_memory = max_memory is None - max_memory = get_max_memory(max_memory) - - num_devices = len([d for d in max_memory if max_memory[d] > 0]) - - if num_devices == 0: - return max_memory - - if num_devices == 1: - # We cannot do low_zero on just one GPU, but we will still reserve some memory for the buffer - low_zero = False - # If user just asked us to handle memory usage, we should avoid OOM - if user_not_set_max_memory: - for key in max_memory.keys(): - if isinstance(key, int): - max_memory[key] *= 0.9 # 90% is a good compromise - logger.info( - f"We will use 90% of the memory on device {key} for storing the model, and 10% for the buffer to avoid OOM. " - "You can set `max_memory` in to a higher value to use more memory (at your own risk)." - ) - break # only one device - - module_sizes = compute_module_sizes(model, dtype=dtype, special_dtypes=special_dtypes) - per_device = module_sizes[""] // (num_devices - 1 if low_zero else num_devices) - - # We can't just set the memory to model_size // num_devices as it will end being too small: each GPU will get - # slightly less layers and some layers will end up offload at the end. So this function computes a buffer size to - # add which is the biggest of: - # - the size of no split block (if applicable) - # - the mean of the layer sizes - if no_split_module_classes is None: - no_split_module_classes = [] - elif not isinstance(no_split_module_classes, (list, tuple)): - no_split_module_classes = [no_split_module_classes] - - # Identify the size of the no_split_block modules - if len(no_split_module_classes) > 0: - no_split_children = {} - for name, size in module_sizes.items(): - if name == "": - continue - submodule = model - for submodule_name in name.split("."): - submodule = getattr(submodule, submodule_name) - class_name = submodule.__class__.__name__ - if class_name in no_split_module_classes and class_name not in no_split_children: - no_split_children[class_name] = size - - if set(no_split_children.keys()) == set(no_split_module_classes): - break - buffer = max(no_split_children.values()) if len(no_split_children) > 0 else 0 - else: - buffer = 0 - - # Compute mean of final modules. In the first dict of module sizes, leaves are the parameters - leaves = get_module_leaves(module_sizes) - module_sizes = {n: v for n, v in module_sizes.items() if n not in leaves} - # Once removed, leaves are the final modules. 
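> Note: `get_module_leaves` above finds "leaf" entries by counting, for every dotted name, how many other names use it as a parent. The sketch below reimplements that counting on a toy size dict; `module_leaves` is a hypothetical name used only to illustrate the logic.

```python
def module_leaves(module_sizes):
    # Same parent/child counting as get_module_leaves: a name is a leaf
    # if no other entry uses it as a dotted prefix.
    children = {}
    for name in module_sizes:
        if name == "" or "." not in name:
            continue
        parent = name.rsplit(".", 1)[0]
        children[parent] = children.get(parent, 0) + 1
    return [n for n in module_sizes if children.get(n, 0) == 0 and n != ""]

sizes = {"": 12, "encoder": 8, "encoder.layer0": 4, "encoder.layer1": 4, "head": 4}
print(module_leaves(sizes))   # ['encoder.layer0', 'encoder.layer1', 'head']
```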
- leaves = get_module_leaves(module_sizes) - mean_leaves = int(sum(module_sizes[n] for n in leaves) / max(len(leaves), 1)) - buffer = int(1.25 * max(buffer, mean_leaves)) - per_device += buffer - - # Sorted list of GPUs id (we may have some gpu ids not included in the our max_memory list - let's ignore them) - gpus_idx_list = list( - sorted( - device_id for device_id, device_mem in max_memory.items() if isinstance(device_id, int) and device_mem > 0 - ) - ) - # The last device is left with max_memory just in case the buffer is not enough. - for idx in gpus_idx_list[:-1]: - max_memory[idx] = min(max_memory[0] if low_zero and idx == 0 else per_device, max_memory[idx]) - - if low_zero: - min_zero = max(0, module_sizes[""] - sum(max_memory[i] for i in range(1, num_devices))) - max_memory[0] = min(min_zero, max_memory[0]) - - return max_memory - -def infer_auto_device_map( - model: nn.Module, - max_memory: Optional[Dict[Union[int, str], Union[int, str]]] = None, - no_split_module_classes: Optional[List[str]] = None, - dtype: Optional[Union[str, mindspore.dtype.TensorType]] = None, - special_dtypes: Optional[Dict[str, str]] = None, - verbose: bool = False, - clean_result: bool = True, - offload_buffers: bool = False, -): - """ - Compute a device map for a given model giving priority to GPUs, then offload on CPU and finally offload to disk, - such that: - - we don't exceed the memory available of any of the GPU. - - if offload to the CPU is needed, there is always room left on GPU 0 to put back the layer offloaded on CPU that - has the largest size. - - if offload to the CPU is needed,we don't exceed the RAM available on the CPU. - - if offload to the disk is needed, there is always room left on the CPU to put back the layer offloaded on disk - that has the largest size. - - - - All computation is done analyzing sizes and dtypes of the model parameters. As a result, the model can be on the - meta device (as it would if initialized within the `init_empty_weights` context manager). - - - - Args: - model (`nn.Module`): - The model to analyze. - max_memory (`Dict`, *optional*): - A dictionary device identifier to maximum memory. Will default to the maximum memory available if unset. - Example: `max_memory={0: "1GB"}`. - no_split_module_classes (`List[str]`, *optional*): - A list of layer class names that should never be split across device (for instance any layer that has a - residual connection). - dtype (`str` or `mindspore.dtype.TensorType`, *optional*): - If provided, the weights will be converted to that type when loaded. - special_dtypes (`Dict[str, str]`, *optional*): - If provided, special dtypes to consider for some specific weights (will override dtype used as default for - all weights). - verbose (`bool`, *optional*, defaults to `False`): - Whether or not to provide debugging statements as the function builds the device_map. - clean_result (`bool`, *optional*, defaults to `True`): - Clean the resulting device_map by grouping all submodules that go on the same device together. - offload_buffers (`bool`, *optional*, defaults to `False`): - In the layers that are offloaded on the CPU or the hard drive, whether or not to offload the buffers as - well as the parameters. 
- """ - # Get default / clean up max_memory - max_memory = get_max_memory(max_memory) - if no_split_module_classes is None: - no_split_module_classes = [] - elif not isinstance(no_split_module_classes, (list, tuple)): - no_split_module_classes = [no_split_module_classes] - - devices = list(max_memory.keys()) - if "disk" not in devices: - devices.append("disk") - gpus = [device for device in devices if device not in ["cpu", "disk"]] - - # Devices that need to keep space for a potential offloaded layer. - if "mps" in gpus: - main_devices = ["mps"] - elif len(gpus) > 0: - main_devices = [gpus[0], "cpu"] - else: - main_devices = ["cpu"] - - module_sizes = compute_module_sizes(model, dtype=dtype, special_dtypes=special_dtypes) - tied_parameters = find_tied_parameters(model) - - if check_tied_parameters_in_config(model) and len(tied_parameters) == 0: - logger.warn( - "The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function." - ) - - device_map = OrderedDict() - current_device = 0 - current_memory_used = 0 - device_memory_used = {} - device_buffer_sizes = {} - - # Direct submodules and parameters - modules_to_treat = ( - list(model.named_parameters(recurse=False)) - + list(model.named_children()) - + list(model.named_buffers(recurse=False)) - ) - # Initialize maximum largest layer, to know which space to keep in memory - max_layer_size, max_layer_names = get_max_layer_size(modules_to_treat, module_sizes, no_split_module_classes) - - # Ready ? This is going to be a bit messy. - while len(modules_to_treat) > 0: - name, module = modules_to_treat.pop(0) - if verbose: - print(f"\nTreating module {name}.") - # Max size in the remaining layers may have changed since we took one, so we maybe update it. - max_layer_names = [n for n in max_layer_names if n != name and not n.startswith(name + ".")] - if len(max_layer_names) == 0: - max_layer_size, max_layer_names = get_max_layer_size( - [(n, m) for n, m in modules_to_treat if isinstance(m, nn.Module)], - module_sizes, - no_split_module_classes, - ) - # Assess size needed - module_size = module_sizes[name] - - # We keep relevant tied parameters only: one of the tied parameters in the group is inside the current module - # and the other is not. - # Note: If we are currently processing the name `compute.weight`, an other parameter named e.g. `compute.weight_submodule.parameter` - # needs to be considered outside the current module, hence the check with additional dots. - tied_param_goups = [ - tied_group - for tied_group in tied_parameters - if any(name + "." in k + "." for k in tied_group) and not all(name + "." in k + "." for k in tied_group) - ] - - if verbose and len(tied_param_goups) > 0: - print(f" Found the relevant tied param groups {tied_param_goups}") - - # Then we keep track of all the parameters that are tied to the current module, but not in the current module - tied_params = sum( - [[p for p in tied_group if name + "." not in p + "."] for tied_group in tied_param_goups], [] - ) - - if verbose and len(tied_params) > 0: - print(f" So those parameters need to be taken into account {tied_params}") - - device = devices[current_device] - current_max_size = max_memory[device] if device != "disk" else None - current_memory_reserved = 0 - # Reduce max size available by the largest layer. - if devices[current_device] in main_devices: - current_max_size = current_max_size - max_layer_size - current_memory_reserved = max_layer_size - # Case 1 -> We're too big! 
- if current_max_size is not None and current_memory_used + module_size > current_max_size: - # Split or not split? - modules_children = ( - [] - if isinstance(module, (mindspore.Tensor, nn.Parameter)) - else list(module.named_children()) - ) - if verbose: - print( - f"Not enough space on {devices[current_device]} to put {name} (space available " - f"{current_max_size - current_memory_used}, module size {module_size})." - ) - if len(modules_children) == 0 or module.__class__.__name__ in no_split_module_classes: - # -> no split, we go to the next device - if verbose: - print("This module cannot be split, going to the next device.") - - device_memory_used[device] = current_memory_used + current_memory_reserved - current_device += 1 - modules_to_treat = [(name, module)] + modules_to_treat - current_memory_used = 0 - else: - # -> split, we replace the module studied by its children + parameters - if verbose: - print(f"Splitting {name}.") - modules_children = list(module.named_parameters(recurse=False)) + modules_children - modules_to_treat = [(f"{name}.{n}", v) for n, v in modules_children] + modules_to_treat - # Update the max layer size. - max_layer_size, max_layer_names = get_max_layer_size( - [(n, m) for n, m in modules_to_treat if isinstance(m, nn.Module)], - module_sizes, - no_split_module_classes, - ) - - # Case 2, it fits! We're not entirely out of the wood though, because we may have some tied parameters. - elif len(tied_params) > 0: - # First locate all tied modules - tied_module_names = [] - tied_modules = [] - for tied_param in tied_params: - tied_module_index = [i for i, (n, _) in enumerate(modules_to_treat) if n in tied_param][0] - tied_module_names.append(modules_to_treat[tied_module_index][0]) - tied_modules.append(modules_to_treat[tied_module_index][1]) - if verbose: - print( - f" It looks like {name} is going to fit on {devices[current_device]} but we have tied " - f"parameters to account for.\n - Names {tied_params}\n - Module names {tied_module_names}" - ) - - # Let's see if it all fits first - module_size_with_ties = module_size - for tied_param, tied_module_name in zip(tied_params, tied_module_names): - module_size_with_ties += module_sizes[tied_module_name] - module_sizes[tied_param] - - if current_max_size is None or current_memory_used + module_size_with_ties <= current_max_size: - # We really really fit! - if verbose: - print(f"Putting {name} and {tied_module_names} on {devices[current_device]}.") - current_memory_used += module_size_with_ties - device_map[name] = devices[current_device] - for tied_module_name in tied_module_names: - if tied_module_name in [m[0] for m in modules_to_treat]: - # The module may have been removed by a previous iteration of this loop. - tied_module_index = [i for i, (n, _) in enumerate(modules_to_treat) if n == tied_module_name][ - 0 - ] - modules_to_treat.pop(tied_module_index) - device_map[tied_module_name] = devices[current_device] - - if not offload_buffers and isinstance(module, nn.Module): - current_buffer_size = compute_module_total_buffer_size( - module, dtype=dtype, special_dtypes=special_dtypes - ) - device_buffer_sizes[device] = device_buffer_sizes.get(device, 0) + current_buffer_size - - else: - # We don't fit with the tied modules. Next question is: can we split one of the tied modules to make it - # smaller or do we need to go on the next device? 
- if verbose: - print( - f"Not enough space on {devices[current_device]} to put {name} and {tied_module_names} (space " - f"available {current_max_size - current_memory_used}, needed size {module_size_with_ties})." - ) - split_happened = False - for tied_module_name, tied_module in zip(tied_module_names, tied_modules): - tied_module_children = list(tied_module.named_children()) - if len(tied_module_children) == 0 or tied_module.__class__.__name__ in no_split_module_classes: - # can't break this one. - continue - - if verbose: - print(f"Splitting {tied_module_name}.") - tied_module_children = list(tied_module.named_parameters(recurse=False)) + tied_module_children - tied_module_children = [(f"{tied_module_name}.{n}", v) for n, v in tied_module_children] - tied_module_index = [i for i, (n, _) in enumerate(modules_to_treat) if n == tied_module_name][0] - - modules_to_treat = ( - [(name, module)] - + modules_to_treat[:tied_module_index] - + tied_module_children - + modules_to_treat[tied_module_index + 1 :] - ) - # Update the max layer size. - max_layer_size, max_layer_names = get_max_layer_size( - [(n, m) for n, m in modules_to_treat if isinstance(m, nn.Module)], - module_sizes, - no_split_module_classes, - ) - split_happened = True - break - - if not split_happened: - # If the tied module is not split, we go to the next device - if verbose: - print("None of the tied module can be split, going to the next device.") - - device_memory_used[device] = current_memory_used + current_memory_reserved - current_device += 1 - modules_to_treat = [(name, module)] + modules_to_treat - current_memory_used = 0 - - else: - if verbose: - if current_max_size is None: - print(f"Putting {name} (size={module_size}) on {devices[current_device]}.") - else: - print( - f"Putting {name} (size={module_size}) on {devices[current_device]} " - f"(available={current_max_size - current_memory_used})." - ) - current_memory_used += module_size - device_memory_used[device] = current_memory_used + current_memory_reserved - device_map[name] = devices[current_device] - - if not offload_buffers and isinstance(module, nn.Module): - current_buffer_size = compute_module_total_buffer_size( - module, dtype=dtype, special_dtypes=special_dtypes - ) - device_buffer_sizes[device] = device_buffer_sizes.get(device, 0) + current_buffer_size - - if clean_result: - device_map = clean_device_map(device_map) - - non_gpu_buffer_size = device_buffer_sizes.get("cpu", 0) + device_buffer_sizes.get("disk", 0) - if non_gpu_buffer_size > 0 and not offload_buffers: - is_buffer_fit_any_gpu = False - for gpu_device, gpu_max_memory in max_memory.items(): - if gpu_device in ('cpu', 'disk'): - continue - - if not is_buffer_fit_any_gpu: - gpu_memory_used = device_memory_used.get(gpu_device, 0) - - if gpu_max_memory >= non_gpu_buffer_size + gpu_memory_used: - is_buffer_fit_any_gpu = True - - if len(gpus) > 0 and not is_buffer_fit_any_gpu: - warnings.warn( - f"Current model requires {non_gpu_buffer_size} bytes of buffer for offloaded layers, which seems does " - f"not fit any GPU's remaining memory. If you are experiencing a OOM later, please consider using " - f"offload_buffers=True." - ) - - return device_map - - -def compute_module_sizes( - model: nn.Module, - dtype: Optional[str] = None, - special_dtypes: Optional[Dict[str, str]] = None, - buffers_only: bool = False, -): - """ - Compute the size of each submodule of a given model. 
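# Usage sketch for `infer_auto_device_map` above. The toy model, the import path for
# `nn`, and the memory caps are illustrative assumptions; the resulting map depends on
# the real parameter sizes measured by `compute_module_sizes`.
from mindnlp.core import nn

class ToyLM(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(1000, 64)
        self.block = nn.Linear(64, 64)
        self.head = nn.Linear(64, 1000)

# Cap device 0 at 1GB and allow 8GB of CPU RAM; whatever does not fit spills to "disk".
device_map = infer_auto_device_map(ToyLM(), max_memory={0: "1GB", "cpu": "8GB"})
# Keys are submodule names, values are device ids, "cpu" or "disk",
# e.g. {"embed": 0, "block": 0, "head": "cpu"} depending on the measured sizes.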
- """ - if dtype is not None: - dtype = _get_proper_dtype(dtype) - dtype_size = dtype_byte_size(dtype) - if special_dtypes is not None: - special_dtypes = {key: _get_proper_dtype(dtyp) for key, dtyp in special_dtypes.items()} - special_dtypes_size = {key: dtype_byte_size(dtyp) for key, dtyp in special_dtypes.items()} - module_sizes = defaultdict(int) - - module_list = [] - - if not buffers_only: - module_list = named_module_tensors(model, recurse=True) - else: - module_list = model.named_buffers(recurse=True) - - for name, tensor in module_list: - if special_dtypes is not None and name in special_dtypes: - size = tensor.numel() * special_dtypes_size[name] - elif dtype is None: - size = tensor.numel() * dtype_byte_size(tensor.dtype) - elif str(tensor.dtype).startswith(("torch.uint", "torch.int", "torch.bool")): - # According to the code in set_module_tensor_to_device, these types won't be converted - # so use their original size here - size = tensor.numel() * dtype_byte_size(tensor.dtype) - else: - size = tensor.numel() * min(dtype_size, dtype_byte_size(tensor.dtype)) - name_parts = name.split(".") - for idx in range(len(name_parts) + 1): - module_sizes[".".join(name_parts[:idx])] += size - - return module_sizes - -def compute_module_total_buffer_size( - model: nn.Module, - dtype: Optional[str] = None, - special_dtypes: Optional[Dict[str, str]] = None, -): - """ - Compute the total size of buffers in each submodule of a given model. - """ - module_sizes = compute_module_sizes(model, dtype=dtype, special_dtypes=special_dtypes, buffers_only=True) - return module_sizes.get("", 0) - - -def get_max_layer_size( - modules: List[Tuple[str, nn.Module]], module_sizes: Dict[str, int], no_split_module_classes: List[str] -): - """ - Utility function that will scan a list of named modules and return the maximum size used by one full layer. The - definition of a layer being: - - a module with no direct children (just parameters and buffers) - - a module whose class name is in the list `no_split_module_classes` - - Args: - modules (`List[Tuple[str, nn.Module]]`): - The list of named modules where we want to determine the maximum layer size. - module_sizes (`Dict[str, int]`): - A dictionary mapping each layer name to its size (as generated by `compute_module_sizes`). - no_split_module_classes (`List[str]`): - A list of class names for layers we don't want to be split. - - Returns: - `Tuple[int, List[str]]`: The maximum size of a layer with the list of layer names realizing that maximum size. - """ - max_size = 0 - layer_names = [] - modules_to_treat = modules.copy() - while len(modules_to_treat) > 0: - module_name, module = modules_to_treat.pop(0) - modules_children = list(module.named_children()) if isinstance(module, nn.Module) else [] - if len(modules_children) == 0 or module.__class__.__name__ in no_split_module_classes: - # No splitting this one so we compare to the max_size - size = module_sizes[module_name] - if size > max_size: - max_size = size - layer_names = [module_name] - elif size == max_size: - layer_names.append(module_name) - else: - modules_to_treat = [(f"{module_name}.{n}", v) for n, v in modules_children] + modules_to_treat - return max_size, layer_names - - -def get_max_memory(max_memory: Optional[Dict[Union[int, str], Union[int, str]]] = None): - """ - Get the maximum memory available if nothing is passed, converts string to int otherwise. 
- """ - if max_memory is None: - max_memory = {} - try: - group_size = get_group_size() - except: - group_size = 1 - device_target = mindspore.get_context('device_target') - if device_target == 'GPU': - devices_free_memory = get_gpus_free_memory() - visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES', None) - export_command = "export CUDA_VISIBLE_DEVICES=0,1,2,3" - elif device_target == 'Ascend': - devices_free_memory = get_npus_free_memory() - visible_devices = os.environ.get('ASCEND_RT_VISIBLE_DEVICES', None) - export_command = "export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3" - - if visible_devices is not None: - visible_devices = visible_devices.split(',') - if len(visible_devices) < group_size: - raise RuntimeError(f'There are {group_size} process with only {len(visible_devices)} visible devices, '\ - f'please use `{export_command}` to set enough devices') - if len(visible_devices) > group_size: - visible_devices = visible_devices[:group_size] - else: - visible_devices = list(range(group_size)) - - if group_size != len(visible_devices): - raise ValueError('The number of process must be equal to visible devices, but got ' - f'{group_size} process and {len(visible_devices)} devices.') - for i in visible_devices: - i = int(i) - max_memory[i] = convert_file_size_to_int(f'{devices_free_memory[i][1]}MIB') - - return max_memory - - for key in max_memory: - if isinstance(max_memory[key], str): - max_memory[key] = convert_file_size_to_int(max_memory[key]) - - # Need to sort the device by type to make sure that we allocate the gpu first. - # As gpu/npu/xpu are represented by int, we need to sort them first. - gpu_devices = [k for k in max_memory.keys() if isinstance(k, int)] - gpu_devices.sort() - # check if gpu/npu/xpu devices are available and if not, throw a warning - num_devices = mindspore.hal.device_count() - for device in gpu_devices: - if device >= num_devices or device < 0: - logger.warning(f"Device {device} is not available, available devices are {list(range(num_devices))}") - # Add the other devices in the preset order if they are available - all_devices = gpu_devices + [k for k in ["mps", "cpu", "disk"] if k in max_memory.keys()] - # Raise an error if a device is not recognized - for k in max_memory.keys(): - if k not in all_devices: - raise ValueError( - f"Device {k} is not recognized, available devices are integers(for GPU/XPU), 'mps', 'cpu' and 'disk'" - ) - max_memory = {k: max_memory[k] for k in all_devices} - - return max_memory - - -def _get_param_device(param, device_map): - if param in device_map: - return device_map[param] - parent_param = ".".join(param.split(".")[:-1]) - if parent_param == param: - raise ValueError(f"The `device_map` does not contain the module {param}.") - else: - return _get_param_device(parent_param, device_map) - - -def check_tied_parameters_on_same_device(tied_params, device_map): - """ - Check if tied parameters are on the same device - - Args: - tied_params (`List[List[str]]`): - A list of lists of parameter names being all tied together. - - device_map (`Dict[str, Union[int, str, torch.device]]`): - A map that specifies where each submodule should go. - - """ - for tie_param in tied_params: - tie_param_devices = {} - for param in tie_param: - tie_param_devices[param] = _get_param_device(param, device_map) - if len(set(tie_param_devices.values())) > 1: - logger.warn( - f"Tied parameters are on different devices: {tie_param_devices}. " - "Please modify your custom device map or set `device_map='auto'`. 
" - ) - - - -def _get_named_modules( - module: nn.Module, - memo: Optional[Set[nn.Module]] = None, - prefix: str = "", - remove_duplicate: bool = True, -): - """ - Return an iterator over all modules in the network, yielding both the name of the module as well as the module - itself. Copied from PyTorch `nn.Module.named_modules` for compatability with torch < 2.0 versions with - `remove_duplicate` option added. - - Args: - memo (set of `nn.Module`, *optional*): - A memo to store the set of modules already added to the result - prefix (`str`, *optional*): - A prefix that will be added to the name of the module - remove_duplicate (`bool`, *optional*): - Whether to remove the duplicated module instances in the result or not - - Yields: - (str, Module): Tuple of name and module - - Note: - Duplicate modules are returned only once. In the following example, ``l`` will be returned only once. - """ - if memo is None: - memo = set() - if module not in memo: - if remove_duplicate: - memo.add(module) - yield prefix, module - for name, sub_module in module._modules.items(): - if module is None: - continue - submodule_prefix = prefix + ("." if prefix else "") + name - yield from _get_named_modules(sub_module, memo, submodule_prefix, remove_duplicate) - - -def _get_named_parameters(module: nn.Module, prefix="", recurse=True, remove_duplicate: bool = True): - """ - Help yield various names + members of modules. Copied from PyTorch `nn.Module.named_modules` for - compatability with torch < 2.0 versions with `remove_duplicate` option added. - """ - memo = set() - modules = ( - _get_named_modules(module, prefix=prefix, remove_duplicate=remove_duplicate) if recurse else [(prefix, module)] - ) - for module_prefix, module in modules: - members = module._parameters.items() - for k, v in members: - if v is None or v in memo: - continue - if remove_duplicate: - memo.add(v) - name = module_prefix + ("." if module_prefix else "") + k - yield name, v - -def find_tied_parameters(model: nn.Module, **kwargs): - """ - Find the tied parameters in a given model. - - - - The signature accepts keyword arguments, but they are for the recursive part of this function and you should ignore - them. - - - - Args: - model (`nn.Module`): The model to inspect. - - Returns: - List[List[str]]: A list of lists of parameter names being all tied together. - - Example: - - ```py - >>> from collections import OrderedDict - >>> import nn as nn - - >>> model = nn.Sequential(OrderedDict([("linear1", nn.Linear(4, 4)), ("linear2", nn.Linear(4, 4))])) - >>> model.linear2.weight = model.linear1.weight - >>> find_tied_parameters(model) - [['linear1.weight', 'linear2.weight']] - ``` - """ - - # get ALL model parameters and thier names - all_named_parameters = {name: param for name, param in _get_named_parameters(model, remove_duplicate=False)} - - # get ONLY unique named parameters, - # if parameter is tied and have multiple names, it will be included only once - no_duplicate_named_parameters = { - name: param for name, param in _get_named_parameters(model, remove_duplicate=True) - } - - # the difference of the two sets will give us the tied parameters - tied_param_names = set(all_named_parameters.keys()) - set(no_duplicate_named_parameters.keys()) - - # 'tied_param_names' contains the names of parameters that are tied in the model, but we do not know - # which names refer to the same parameter. To identify this, we need to group them together. 
- tied_param_groups = {} - for tied_param_name in tied_param_names: - tied_param = all_named_parameters[tied_param_name] - for param_name, param in no_duplicate_named_parameters.items(): - # compare if parameters are the same, if so, group thier names together - if param is tied_param: - if param_name not in tied_param_groups: - tied_param_groups[param_name] = [] - tied_param_groups[param_name].append(tied_param_name) - - return FindTiedParametersResult([sorted([weight] + list(set(tied))) for weight, tied in tied_param_groups.items()]) - - -class FindTiedParametersResult(list): - """ - This is a subclass of a list to handle backward compatibility for Transformers. Do not rely on the fact this is not - a list or on the `values` method as in the future this will be removed. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def values(self): - # TODO: at the next Transformers release (4.28.0) issue a deprecation warning here. - return sum([x[1:] for x in self], []) - - -def check_tied_parameters_in_config(model: nn.Module): - """ - Check if there is any indication in the given model that some weights should be tied. - - Args: - model (`torch.nn.Module`): The model to inspect - - Returns: - bool: True if the model needs to have tied weights - """ - - # based on model.tie_weights() method - has_tied_word_embedding = False - has_tied_encoder_decoder = False - has_tied_module = False - - if "PreTrainedModel" in [c.__name__ for c in inspect.getmro(model.__class__)]: - has_tied_word_embedding = ( - hasattr(model, "config") - and getattr(model.config, "tie_word_embeddings", False) - and model.get_output_embeddings() - ) - has_tied_encoder_decoder = ( - hasattr(model, "config") - and getattr(model.config, "is_encoder_decoder", False) - and getattr(model.config, "tie_encoder_decoder", False) - ) - has_tied_module = any(hasattr(module, "_tie_weights") for module in model.modules()) - - return any([has_tied_word_embedding, has_tied_encoder_decoder, has_tied_module]) - - -def retie_parameters(model, tied_params): - """ - Reties tied parameters in a given model if the link was broken (for instance when adding hooks). - - Args: - model (`nn.Module`): - The model in which to retie parameters. - tied_params (`List[List[str]]`): - A mapping parameter name to tied parameter name as obtained by `find_tied_parameters`. - """ - for tied_group in tied_params: - param_to_tie = None - # two loops : the first one to set param_to_tie , the second one to change the values of tied_group - for param_name in tied_group: - module = model - splits = param_name.split(".") - for split in splits[:-1]: - module = getattr(module, split) - param = getattr(module, splits[-1]) - if param_to_tie is None: - param_to_tie = param - break - if param_to_tie is not None: - for param_name in tied_group: - module = model - splits = param_name.split(".") - for split in splits[:-1]: - module = getattr(module, split) - setattr(module, splits[-1], param_to_tie) - -def _get_proper_dtype(dtype): - """ - Just does mindspore.dtype.TensorType(dtype) if necessary. - """ - if isinstance(dtype, str): - # We accept "mindspore.float16" or just "float16" - dtype = dtype.replace("mindspore.", "") - dtype = getattr(mindspore, dtype) - return dtype - - -def named_module_tensors( - module: nn.Module, include_buffers: bool = True, recurse: bool = False, remove_non_persistent: bool = False -): - """ - A helper function that gathers all the tensors (parameters + buffers) of a given module. 
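# Round-trip sketch for `find_tied_parameters` and `retie_parameters` above, assuming
# `mindnlp.core.nn` mirrors the torch-style `Linear`/`Parameter` API used in this file:
from collections import OrderedDict
from mindnlp.core import nn

model = nn.Sequential(OrderedDict([("linear1", nn.Linear(4, 4)), ("linear2", nn.Linear(4, 4))]))
model.linear2.weight = model.linear1.weight                 # tie the two weights
groups = find_tied_parameters(model)                        # [['linear1.weight', 'linear2.weight']]

model.linear2.weight = nn.Parameter(model.linear2.weight.clone())   # tie broken (e.g. by a hook)
retie_parameters(model, groups)                             # restore the shared Parameter
assert model.linear1.weight is model.linear2.weight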
If `include_buffers=True` - it's the same as doing `module.named_parameters(recurse=recurse) + module.named_buffers(recurse=recurse)`. - - Args: - module (`nn.Module`): - The module we want the tensors on. - include_buffer (`bool`, *optional*, defaults to `True`): - Whether or not to include the buffers in the result. - recurse (`bool`, *optional`, defaults to `False`): - Whether or not to go look in every submodule or just return the direct parameters and buffers. - remove_non_persistent (`bool`, *optional*, defaults to `False`): - Whether or not to remove the non persistent buffer from the buffers. Useful only when include_buffers = - True - """ - yield from module.named_parameters(recurse=recurse) - - if include_buffers: - non_persistent_buffers = set() - if remove_non_persistent: - non_persistent_buffers = get_non_persistent_buffers(module, recurse=recurse) - for named_buffer in module.named_buffers(recurse=recurse): - name, _ = named_buffer - if name not in non_persistent_buffers: - yield named_buffer - - -def get_non_persistent_buffers(module: nn.Module, recurse: bool = False): - """ - Gather all non persistent buffers of a given modules into a set - - Args: - module (`nn.Module`): - The module we want the non persistent buffers on. - recurse (`bool`, *optional*, defaults to `False`): - Whether or not to go look in every submodule or just return the direct non persistent buffers. - """ - - non_persistent_buffers_set = module._non_persistent_buffers_set - if recurse: - for _, m in module.named_modules(): - non_persistent_buffers_set |= m._non_persistent_buffers_set - - return non_persistent_buffers_set - -def clean_device_map(device_map: Dict[str, Union[int, str]], module_name: str = ""): - """ - Cleans a device_map by grouping all submodules that go on the same device together. - """ - # Get the value of the current module and if there is only one split across several keys, regroup it. - prefix = "" if module_name == "" else f"{module_name}." 
- values = [v for k, v in device_map.items() if k.startswith(prefix)] - if len(set(values)) == 1 and len(values) > 1: - for k in [k for k in device_map if k.startswith(prefix)]: - del device_map[k] - device_map[module_name] = values[0] - - # Recurse over the children - children_modules = [k for k in device_map.keys() if k.startswith(prefix) and len(k) > len(module_name)] - idx = len(module_name.split(".")) + 1 if len(module_name) > 0 else 1 - children_modules = set(".".join(k.split(".")[:idx]) for k in children_modules) - for child in children_modules: - clean_device_map(device_map, module_name=child) - - return device_map - -def modify_model_for_pp_infer(model: nn.Module, device_map, no_split_module_classes): - current_device = get_rank() - last_device = get_group_size() - 1 - reversed_device_map = {} - for scope_name, device in device_map.items(): - if device not in reversed_device_map: - reversed_device_map[device] = [scope_name] - else: - reversed_device_map[device].append(scope_name) - - if device != current_device: - submodule = model.get_submodule(scope_name) - if isinstance(submodule, nn.Embedding): - new_embedding = EmbeddingIndentity(submodule.num_embeddings, submodule.embedding_dim, model.dtype) - replace_submodule(model, scope_name, new_embedding) - elif isinstance(submodule, nn.Linear): - new_linear = LinearIndetity(submodule.in_features, submodule.out_features, model.dtype) - replace_submodule(model, scope_name, new_linear) - elif submodule.__class__.__name__ in no_split_module_classes: - new_layer = DecoderLayerIdentity(submodule.self_attn.layer_idx, submodule.self_attn.config) - replace_submodule(model, scope_name, new_layer) - else: - # new_layer = nn.Identity() - # replace_submodule(model, scope_name, new_layer) - pass - - if current_device < last_device: - current_last_layer = model.get_submodule(reversed_device_map[current_device][-1]) - current_last_layer._forward = current_last_layer.forward - current_last_layer.forward = types.MethodType(send_forward, current_last_layer) - current_last_layer.dist = current_device + 1 - - if current_device > 0: - current_first_layer = model.get_submodule(reversed_device_map[current_device][0]) - current_first_layer._forward = current_first_layer.forward - current_first_layer.forward = types.MethodType(receive_forward, current_first_layer) - current_first_layer.src = current_device - 1 - - model_last_layer = model.get_submodule(next(reversed(device_map))) - model_last_layer._forward = model_last_layer.forward - model_last_layer.forward = types.MethodType(broadcast_forward, model_last_layer) - model_last_layer.src = last_device - - return model - -def find_usefull_files(shared_files, shared_meta, model_params): - files_path = '/'.join(shared_files[0].split('/')[:-1]) - usefull_files = set() - loaded_keys = [] - for param_name, file_name in shared_meta['weight_map'].items(): - if param_name in model_params: - usefull_files.add(file_name) - loaded_keys.append(param_name) - - usefull_files = [files_path + '/' + file for file in usefull_files] - - return usefull_files, loaded_keys - - -def replace_submodule(model, submodule_path, new_module): - parent_path, _, child_name = submodule_path.rpartition('.') - - parent_module = model.get_submodule(parent_path) if parent_path else model - - setattr(parent_module, child_name, new_module) - -def send_forward(self, *args, **kwargs): - output = self._forward(*args, **kwargs) - isend(output[0], self.dist) - return output - -def receive_forward(self, *args, **kwargs): - hidden_states = args[0] - 
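# `clean_device_map` collapses submodules that all ended up on the same device into a
# single entry for their parent module (devices here are illustrative):
raw_map = {"block.attn": 0, "block.mlp": 0, "head": "cpu"}
assert clean_device_map(dict(raw_map)) == {"head": "cpu", "block": 0}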
hidden_states = irecv(hidden_states, src=self.src) - output = self._forward(*((hidden_states,) + args[1:]), **kwargs) - return output - -def broadcast_forward(self, *args, **kwargs): - output = self._forward(*args, **kwargs) - output = broadcast(output, src=self.src) - return output - - -class DecoderLayerIdentity(nn.Module): - def __init__(self, layer_idx, config): - super().__init__() - self.layer_idx = layer_idx - self.num_key_value_heads = config.num_key_value_heads - - def forward(self, *args, **kwargs): - output_attentions = kwargs.get('output_attentions', False) - use_cache = kwargs.get('use_cache', False) - past_key_value = kwargs.get('past_key_value', None) - hidden_states = args[0] - bs, seq_len, _ = hidden_states.shape - if output_attentions: - logger.warning('`output_attentions` should set to `False` during multi-process inference.') - - if past_key_value is not None: - past_key_value.update( - ops.empty(bs, self.num_key_value_heads, seq_len, 0), - ops.empty(bs, self.num_key_value_heads, seq_len, 0), - self.layer_idx) - - output = (hidden_states,) - - if output_attentions: - output = output + (None,) - - if use_cache: - output = output + (past_key_value,) - - output = output + (None,) - return output - - -class EmbeddingIndentity(nn.Module): - def __init__(self, num_embeddings: int, embedding_dim: int, dtype=None): - super().__init__() - self.num_embeddings = num_embeddings - self.embedding_dim = embedding_dim - self.dtype = dtype - - def forward(self, input): - return ops.empty(input.shape + (self.embedding_dim,), dtype=self.dtype) - -class LinearIndetity(nn.Module): - def __init__(self, in_features, out_features, dtype=None): - super().__init__() - self.in_features = in_features - self.out_features = out_features - self.dtype = dtype - - def forward(self, input): - return ops.empty(input.shape[:-1] + (self.out_features,), dtype=self.dtype) diff --git a/mindnlp/accelerate/utils/other.py b/mindnlp/accelerate/utils/other.py deleted file mode 100644 index ad537d1e9..000000000 --- a/mindnlp/accelerate/utils/other.py +++ /dev/null @@ -1,15 +0,0 @@ -"""accelerate utilities.""" -from ..state import PartialState - - -def wait_for_everyone(): - """ - Introduces a blocking point in the script, making sure all processes have reached this point before continuing. - - - - Make sure all processes will reach this instruction otherwise one of your processes will hang forever. 
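# Usage sketch for the pipeline-parallel helpers above: a rank that does not own a
# layer swaps it for the shape-only stub so the graph still lines up, while the real
# activations travel through isend/irecv. The toy module and names are assumptions.
from collections import OrderedDict
from mindnlp.core import nn

model = nn.Sequential(OrderedDict([("proj", nn.Linear(8, 16))]))
replace_submodule(model, "proj", LinearIndetity(8, 16))
# model.proj now returns an empty (batch, 16) tensor instead of running the matmul;
# the owning rank computes the real output and ships it via send_forward/receive_forward.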
- - - """ - PartialState().wait_for_everyone() diff --git a/mindnlp/amp.py b/mindnlp/amp.py deleted file mode 100644 index c2bb57ab6..000000000 --- a/mindnlp/amp.py +++ /dev/null @@ -1,214 +0,0 @@ -"""Operator level amp""" -import functools -from typing import Any - -import mindspore - -CELL_WHITE_LIST = [ - 'Dense', - 'Conv1d', - 'Conv2d', - 'Conv3d', -] - -OP_WHITE_LIST = [ - 'MatMul', - 'BatchMatMul', - 'Dense', - 'Conv2D', - 'Conv2DTranspose', - 'Conv3D', - 'Conv3DTranspose', - 'LSTM', - 'CudnnGRU', - 'PReLU' -] - -OP_BLACK_LIST = [ - 'Asin', - 'Acos', - 'BCEWithLogitsLoss', - 'BinaryCrossEntropy', - 'Cosh', - 'Cdis', - 'CumProd', - 'CumSum', - 'Div', - 'Erfinv', - 'Exp', - 'Expm1', - 'KLDivLoss', - 'LayerNorm', - 'Log', - 'LogSoftmax', - 'Log10', - 'Log1p', - 'Log2', - 'MultilabelMarginLoss', - 'MultiMarginLoss', - 'NLLLoss', - 'LpNorm', - 'L2Normalize', - 'Pdist', - 'Pow', - 'RealDiv', - 'ReduceProd', - 'Reciprocal', - 'Rsqrt', - 'Renorm', - 'Sinh', - 'Sum', - 'Softplus', - 'Softmax', - 'Softmin', - 'SoftMarginLoss', - 'SoftmaxCrossEntropyWithLogits', - 'SparseSoftmaxCrossEntropyWithLogits', - 'SmoothL1Loss', - 'Tan', - 'TripletMarginLoss' -] - -GLOBAL_AMP = False -GLOBAL_AMP_DTYPE = mindspore.float32 - -def _set_amp(mode, dtype): - r""" - Sets the global amplifier mode and data type. - - Args: - mode (str): The mode to set the global amplifier to. Valid values are 'on' or 'off'. - dtype (type): The data type to set for the global amplifier. This can be any valid Python data type. - - Returns: - None: This function does not return any value. - - Raises: - None: This function does not raise any exceptions. - """ - global GLOBAL_AMP - global GLOBAL_AMP_DTYPE - GLOBAL_AMP = mode - GLOBAL_AMP_DTYPE = dtype - -def get_global_amp(): - r""" - Returns the global amplitude and its data type. - - Returns: - tuple: A tuple containing the global amplitude and its data type. - """ - return GLOBAL_AMP, GLOBAL_AMP_DTYPE - - -def autocast_decorator(autocast_instance, func): - r""" - Decorator function that applies an autocast instance to a given function. - - Args: - autocast_instance (Autocast): An instance of the Autocast class. - The Autocast class provides a context manager that automatically casts inputs to a specified data type. - This autocast instance will be used to automatically cast the inputs of the decorated function. - func (function): The function to be decorated. - This function will be called within the context of the autocast instance. - - Returns: - None - - Raises: - None - """ - @functools.wraps(func) - def decorate_autocast(*args, **kwargs): - with autocast_instance: - return func(*args, **kwargs) - - return decorate_autocast - -class autocast: - - r""" - The 'autocast' class represents a context manager for automatic mixed precision (AMP) in Python. It provides functionality for enabling or disabling automatic mixed precision for a specific code block and -specifying the data type for computations. - - Upon entering the context, the 'autocast' class sets the enabled state and data type for AMP. Upon exiting the context, it restores the original data type. Additionally, the class can be used as a -decorator for functions to apply automatic mixed precision to the decorated function. - - This class is designed to be used in conjunction with the MindSpore framework for deep learning and neural network computations. - """ - def __init__( - self, - enabled: bool = True, - dtype = mindspore.float16, - ): - r""" - Initialize the autocast object. 
- - Args: - self (object): The instance of the autocast class. - enabled (bool, optional): A flag indicating whether autocast is enabled. Defaults to True. - dtype (dtype, optional): The data type for autocasting. Defaults to mindspore.float16. - - Returns: - None. This method does not return any value. - - Raises: - None. - """ - self.enabled = enabled - self.dtype = dtype - self.old_dtype = GLOBAL_AMP_DTYPE - - def __enter__(self): - r""" - Method '__enter__' in the class 'autocast'. - - Args: - self: autocast instance. - Represents the current instance of the autocast class. - - Returns: - None. - The method does not explicitly return any value. - - Raises: - No specific exceptions are raised by this method. - """ - _set_amp(self.enabled, self.dtype) - return self - - def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any): - r""" - This method is called when exiting a context managed by the 'autocast' class. - - Args: - self: Instance of the 'autocast' class. - exc_type: Type of the exception being handled. - exc_val: Value of the exception being handled. - exc_tb: Traceback of the exception being handled. - - Returns: - None. This method does not return any value. - - Raises: - This method does not raise any exceptions explicitly. - However, exceptions may be raised during the execution of '_set_amp' function called within this method. - """ - _set_amp(self.enabled, self.old_dtype) - return False - - def __call__(self, func): - r""" - Executes the '__call__' method of the 'autocast' class. - - Args: - self (autocast): An instance of the 'autocast' class. - func (function): The function to be decorated. - - Returns: - None: This method does not return any value. - - Raises: - None: This method does not raise any exceptions. - """ - return autocast_decorator(self, func) diff --git a/mindnlp/common/__init__.py b/mindnlp/common/__init__.py deleted file mode 100644 index 2a0b8fcfe..000000000 --- a/mindnlp/common/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -""" -common modules for all submodule(transformers, mimm, peft, trl, diffusers, etc), include: -activations, optimization, etc. -""" diff --git a/mindnlp/common/activations.py b/mindnlp/common/activations.py deleted file mode 100644 index 40e665df7..000000000 --- a/mindnlp/common/activations.py +++ /dev/null @@ -1,342 +0,0 @@ -# Copyright 2022 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
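# Usage sketch for the operator-level AMP switch defined above: `autocast` works both
# as a context manager and as a decorator, and op-level dispatch can consult
# `get_global_amp()` to decide whether to cast (mindspore assumed installed).
import mindspore

with autocast(enabled=True, dtype=mindspore.float16):
    enabled, amp_dtype = get_global_amp()     # (True, mindspore.float16) inside the block

@autocast(dtype=mindspore.float16)
def train_step(x):
    # every call runs with the global AMP flag set to float16
    return x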
-# ============================================================================ -"""MindNLP Activations""" -import math -from collections import OrderedDict -from mindspore import Tensor -from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F - -def gelu_tanh(x: Tensor, inplace: bool = False) -> Tensor: - return F.gelu(x, approximate='tanh') - -def quick_gelu(x: Tensor, inplace: bool = False) -> Tensor: - return x * ops.sigmoid(1.702 * x) - -def hard_mish(x, inplace: bool = False): - """ Hard Mish - Experimental, based on notes by Mish author Diganta Misra at - https://github.com/digantamisra98/H-Mish/blob/0da20d4bc58e696b6803f2523c58d3c8a82782d0/README.md - """ - if inplace: - return x.mul(0.5 * (x + 2).clamp(min=0, max=2)) - else: - return 0.5 * x * (x + 2).clamp(min=0, max=2) - - -class HardMish(nn.Module): - def __init__(self, inplace: bool = False): - super(HardMish, self).__init__() - self.inplace = inplace - - def forward(self, x): - return hard_mish(x, self.inplace) - -class GELUTanh(nn.Module): - """Applies the Gaussian Error Linear Units function (w/ dummy inplace arg) - """ - def __init__(self, inplace: bool = False): - super(GELUTanh, self).__init__() - - def forward(self, input: Tensor) -> Tensor: - return F.gelu(input, approximate='tanh') - -class QuickGELU(nn.Module): - """ - Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs - """ - def forward(self, input: Tensor) -> Tensor: - r""" - forwards the QuickGELU activation function. - """ - return quick_gelu(input) - - -class ClippedGELUActivation(nn.Module): - """ - Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as - it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to - https://arxiv.org/abs/2004.09602. - - Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when - initially created. - - For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + - torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://arxiv.org/abs/1606.08415 - """ - def __init__(self, min: float, max: float): - r""" - Initializes an instance of the ClippedGELUActivation class. - - Args: - self: The instance of the ClippedGELUActivation class. - min (float): The minimum value for clipping. - The value of 'min' should be less than 'max'. - max (float): The maximum value for clipping. - The value of 'max' should be greater than 'min'. - - Returns: - None. - - Raises: - ValueError: If 'min' is greater than 'max', a ValueError is raised with a detailed error message. - """ - if min > max: - raise ValueError(f"min should be < max (got min: {min}, max: {max})") - - super().__init__() - self.min = min - self.max = max - - def forward(self, x: Tensor) -> Tensor: - r""" - forwards a ClippedGELUActivation function with input clipping. - - Args: - self: ClippedGELUActivation - The instance of the ClippedGELUActivation class. - - x: Tensor - The input tensor to the activation function. - - Returns: - Tensor: - The tensor resulting from applying the ClippedGELUActivation function to the input tensor, - with values clipped to the range [min, max]. 
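# Quick numeric check of the approximation above: `quick_gelu` replaces the erf-based
# GELU with x * sigmoid(1.702 * x); the two curves stay within a few hundredths of each
# other (mindspore assumed installed, sample points illustrative).
import mindspore
from mindnlp.core.nn import functional as F

x = mindspore.Tensor([-2.0, -0.5, 0.0, 0.5, 2.0], mindspore.float32)
print(quick_gelu(x))   # sigmoid approximation
print(F.gelu(x))       # exact GELU for comparison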
- - Raises: - None - """ - return ops.clip(gelu(x), self.min, self.max) - - -class AccurateGELUActivation(nn.Module): - """ - Applies GELU approximation that is faster than default and more accurate than QuickGELU. See: - https://github.com/hendrycks/GELUs - - Implemented along with MEGA (Moving Average Equipped Gated Attention) - """ - def __init__(self): - r""" - Initializes an instance of the AccurateGELUActivation class. - - Args: - self: The instance of the class itself. - - Returns: - None. - - Raises: - None. - """ - super().__init__() - self.precomputed_constant = math.sqrt(2 / math.pi) - - def forward(self, input: Tensor) -> Tensor: - r""" - This method 'forward' is responsible for applying the Accurate Gaussian Error Linear Unit (GELU) - activation function to the input tensor. - - Args: - self (AccurateGELUActivation): The instance of the AccurateGELUActivation class. - input (Tensor): - The input tensor on which the GELU activation function will be applied. - It represents the input values to be transformed. It should be a tensor of numerical values. - - Returns: - Tensor: - A tensor of the same shape as the input tensor, containing the output values after applying the - Accurate GELU activation function. - The transformed tensor represents the non-linearity applied to the input tensor. - - Raises: - TypeError: If the input tensor is not of type Tensor. - ValueError: If the dimensions of the input tensor are not compatible with the operations within the method. - RuntimeError: If there is an issue during the computation of the GELU activation function. - """ - return 0.5 * input * (1 + ops.tanh(self.precomputed_constant * (input + 0.044715 * ops.pow(input, 3)))) - - -class MishActivation(nn.Module): - """ - See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also - visit the official repository for the paper: https://github.com/digantamisra98/Mish - """ - def forward(self, input: Tensor) -> Tensor: - r""" - forwards a Mish activation function on the input tensor. - - Args: - self (MishActivation): An instance of the MishActivation class. - input (Tensor): The input tensor to apply the activation function on. - - Returns: - Tensor: The tensor with the Mish activation function applied. - - Raises: - None. - - The Mish activation function is defined as the element-wise product of the input tensor and - the hyperbolic tangent of the softplus function applied to the input tensor. - This activation function introduces a non-linearity that helps in capturing more complex patterns in the data. - - Note: - - The input tensor should have a shape that is compatible with the activation function. - """ - return input * ops.tanh(ops.softplus(input)) - - -class LinearActivation(nn.Module): - """ - Applies the linear activation function, i.e. forwarding input directly to output. - """ - def forward(self, input: Tensor) -> Tensor: - r""" - forward method in the LinearActivation class. - - Args: - self (object): The instance of the LinearActivation class. - input (Tensor): The input tensor to be processed. - - Returns: - Tensor: The processed tensor as per the implementation. - - Raises: - None. - """ - return input - - -class LaplaceActivation(nn.Module): - """ - Applies elementwise activation based on Laplace function, introduced in MEGA as an attention activation. 
See - https://arxiv.org/abs/2209.10655 - - Inspired by squared relu, but with bounded range and gradient for better stability - """ - def forward(self, input, mu=0.707107, sigma=0.282095): - r""" - This method 'forward' in the class 'LaplaceActivation' performs a Laplace activation function transformation - on the input data. - - Args: - self (object): The instance of the class. - input (tensor): The input data to be transformed using the Laplace activation function. - mu (float, optional): The mean value used for normalization. Default is 0.707107. - sigma (float, optional): The standard deviation value used for normalization. Default is 0.282095. - - Returns: - None. - - Raises: - ValueError: If the input data is not a valid tensor. - TypeError: If the input data or the normalization parameters are of incorrect types. - ZeroDivisionError: If sigma is set to zero, resulting in division by zero. - """ - input = (input - mu).div(sigma * math.sqrt(2.0)) - return 0.5 * (1.0 + ops.erf(input)) - - -class ReLUSquaredActivation(nn.Module): - """ - Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2 - """ - def forward(self, input): - r""" - forwards the ReLU squared activation of the input. - - Args: - self (object): Instance of the ReLUSquaredActivation class. - input (numeric): The input value to be processed by the activation function. - - Returns: - None: This method returns None as it updates the internal state of the object. - - Raises: - None. - """ - # relu_applied = ops.relu(input) - relu_applied = nn.functional.relu(input) - squared = ops.square(relu_applied) - return squared - - -class ClassInstantier(OrderedDict): - r""" - Class Instantier - """ - def __getitem__(self, key): - r""" - Retrieve an item from the ClassInstantier object using the specified key. - - Args: - self (ClassInstantier): The ClassInstantier object itself. - key: The key used to retrieve the item from the object. - - Returns: - None. - - Raises: - None. - """ - content = super().__getitem__(key) - cls, kwargs = content if isinstance(content, tuple) else (content, {}) - return cls(**kwargs) - - -ACT2CLS = { - """ - Excitation equation matrix - """ - 'relu': nn.ReLU, - 'gelu': nn.GELU, - 'gelu_new': (nn.GELU, {'approximate': 'tanh'}), - 'gelu_approximate': nn.GELU, - 'gelu_pytorch_tanh': nn.GELU, - "swish": nn.SiLU, - "gelu_10": nn.GELU, - "gelu_fast": (nn.GELU, {'approximate': 'tanh'}), - "gelu_python": nn.GELU, - "linear": nn.ReLU, - "mish": nn.Mish, - "quick_gelu": QuickGELU, - "relu": nn.ReLU, - "relu2": ReLUSquaredActivation, - "relu6": nn.ReLU6, - "sigmoid": nn.Sigmoid, - "silu": nn.SiLU, - "tanh": nn.Tanh, -} -ACT2FN = ClassInstantier(ACT2CLS) - - -def get_activation(activation_string): - """ - Obtained parameters required for outputting self. 
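# `ClassInstantier` makes ACT2FN return ready-to-use module instances: indexing the
# mapping instantiates the class, passing kwargs when the entry is a (cls, kwargs) pair.
import mindspore

gelu_new_act = ACT2FN["gelu_new"]     # nn.GELU(approximate='tanh') instance
swish_act = ACT2FN["swish"]           # nn.SiLU() instance
y = gelu_new_act(mindspore.Tensor([1.0, -1.0], mindspore.float32))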
activation in the SequenceSummary class - :param activation_string: - :return: - """ - if activation_string in ACT2FN: - return ACT2FN[activation_string] - raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}") - -gelu_python = get_activation("gelu_python") -gelu_new = get_activation("gelu_new") -gelu = get_activation("gelu") -gelu_fast = get_activation("gelu_fast") -silu = get_activation("silu") -mish = get_activation("mish") -linear_act = get_activation("linear") diff --git a/mindnlp/common/modules/__init__.py b/mindnlp/common/modules/__init__.py deleted file mode 100644 index f0387de34..000000000 --- a/mindnlp/common/modules/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright 2022 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""modules init""" -from . import loss, attentions, crf -from .attentions import ScaledDotProductAttention, DotProductAttention, \ - BilinearAttention, AdditiveAttention, CosineAttention, \ - LinearAttention -from .crf import CRF -from .loss import RDropLoss, CMRC2018Loss - -__all__ = [] - -__all__.extend(attentions.__all__) -__all__.extend(crf.__all__) -__all__.extend(loss.__all__) diff --git a/mindnlp/common/modules/attentions.py b/mindnlp/common/modules/attentions.py deleted file mode 100644 index d948e7bf6..000000000 --- a/mindnlp/common/modules/attentions.py +++ /dev/null @@ -1,286 +0,0 @@ -# Copyright 2022 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""attention module""" -import math -from typing import Optional - -import mindspore -from mindnlp.core.nn import Parameter -from mindnlp.core import nn, ops -from .utils import masked_softmax, tiny_value_of_dtype, get_combined_dim, combine_tensors_and_multiply - -class Attention(nn.Module): - """ - An `Attention` takes two inputs: a (batched) vector and a matrix, plus an optional mask on the - rows of the matrix. We compute the similarity between the vector and each row in the matrix, - and then (optionally) perform a softmax over rows using those computed similarities. - - - Inputs: - - - vector: shape `(batch_size, embedding_dim)` - - matrix: shape `(batch_size, num_rows, embedding_dim)` - - matrix_mask: shape `(batch_size, num_rows)`, specifying which rows are just padding. - - Output: - - - attention: shape `(batch_size, num_rows)`. 
- - # Parameters - - normalize : `bool`, optional (default = `True`) - If true, we normalize the computed similarities with a softmax, to return a probability - distribution for your attention. If false, this is just computing a similarity score. - """ - - def __init__(self, normalize: bool = True) -> None: - super().__init__() - self._normalize = normalize - - def forward( - self, vector: mindspore.Tensor, matrix: mindspore.Tensor, matrix_mask: mindspore.Tensor = None - ) -> mindspore.Tensor: - similarities = self._forward_internal(vector, matrix) - if self._normalize: - return masked_softmax(similarities, matrix_mask) - else: - return similarities - - def _forward_internal(self, vector: mindspore.Tensor, matrix: mindspore.Tensor) -> mindspore.Tensor: - raise NotImplementedError - - -class AdditiveAttention(Attention): - """ - Computes attention between a vector and a matrix using an additive attention function. This - function has two matrices `W`, `U` and a vector `V`. The similarity between the vector - `x` and the matrix `y` is computed as `V tanh(Wx + Uy)`. - - This attention is often referred as concat or additive attention. It was introduced in - [Neural Machine Translation by Jointly Learning to Align and Translate (Bahdanau et al, 2015)] - (https://api.semanticscholar.org/CorpusID:11212020). - - Registered as an `Attention` with name "additive". - - # Parameters - - vector_dim : `int`, required - The dimension of the vector, `x`, described above. This is `x.size()[-1]` - the length - of the vector that will go into the similarity computation. We need this so we can build - the weight matrix correctly. - matrix_dim : `int`, required - The dimension of the matrix, `y`, described above. This is `y.size()[-1]` - the length - of the vector that will go into the similarity computation. We need this so we can build - the weight matrix correctly. - normalize : `bool`, optional (default = `True`) - If true, we normalize the computed similarities with a softmax, to return a probability - distribution for your attention. If false, this is just computing a similarity score. - """ - - def __init__(self, vector_dim: int, matrix_dim: int, normalize: bool = True) -> None: - super().__init__(normalize) - self._w_matrix = Parameter(mindspore.Tensor(vector_dim, vector_dim)) - self._u_matrix = Parameter(mindspore.Tensor(matrix_dim, vector_dim)) - self._v_vector = Parameter(mindspore.Tensor(vector_dim, 1)) - self.reset_parameters() - - def reset_parameters(self): - nn.init.xavier_uniform_(self._w_matrix) - nn.init.xavier_uniform_(self._u_matrix) - nn.init.xavier_uniform_(self._v_vector) - - def _forward_internal(self, vector: mindspore.Tensor, matrix: mindspore.Tensor) -> mindspore.Tensor: - intermediate = vector.matmul(self._w_matrix).unsqueeze(1) + matrix.matmul(self._u_matrix) - intermediate = ops.tanh(intermediate) - return intermediate.matmul(self._v_vector).squeeze(2) - - -class BilinearAttention(Attention): - """ - Computes attention between a vector and a matrix using a bilinear attention function. This - function has a matrix of weights `W` and a bias `b`, and the similarity between the vector - `x` and the matrix `y` is computed as `x^T W y + b`. - - Registered as an `Attention` with name "bilinear". - - # Parameters - - vector_dim : `int`, required - The dimension of the vector, `x`, described above. This is `x.size()[-1]` - the length - of the vector that will go into the similarity computation. We need this so we can build - the weight matrix correctly. 
- matrix_dim : `int`, required - The dimension of the matrix, `y`, described above. This is `y.size()[-1]` - the length - of the vector that will go into the similarity computation. We need this so we can build - the weight matrix correctly. - activation : `Activation`, optional (default=`linear`) - An activation function applied after the `x^T W y + b` calculation. Default is - linear, i.e. no activation. - normalize : `bool`, optional (default=`True`) - If true, we normalize the computed similarities with a softmax, to return a probability - distribution for your attention. If false, this is just computing a similarity score. - """ - - def __init__( - self, - vector_dim: int, - matrix_dim: int, - activation: None, - normalize: bool = True, - ) -> None: - super().__init__(normalize) - self._weight_matrix = Parameter(mindspore.Tensor(vector_dim, matrix_dim)) - self._bias = Parameter(mindspore.Tensor(1)) - self._activation = activation - self.reset_parameters() - - def reset_parameters(self): - nn.init.xavier_uniform_(self._weight_matrix) - nn.init.zeros_(self._bias) - - def _forward_internal(self, vector: mindspore.Tensor, matrix: mindspore.Tensor) -> mindspore.Tensor: - intermediate = vector.mm(self._weight_matrix).unsqueeze(1) - return self._activation(intermediate.bmm(matrix.swapaxes(1, 2)).squeeze(1) + self._bias) - - -class CosineAttention(Attention): - """ - Computes attention between a vector and a matrix using cosine similarity. - - Registered as an `Attention` with name "cosine". - """ - - def _forward_internal(self, vector: mindspore.Tensor, matrix: mindspore.Tensor) -> mindspore.Tensor: - a_norm = vector / ( - vector.norm(p=2, dim=-1, keepdim=True) + tiny_value_of_dtype(vector.dtype) - ) - b_norm = matrix / ( - matrix.norm(p=2, dim=-1, keepdim=True) + tiny_value_of_dtype(matrix.dtype) - ) - return ops.bmm(a_norm.unsqueeze(dim=1), b_norm.swapaxes(-1, -2)).squeeze(1) - - -class DotProductAttention(Attention): - """ - Computes attention between a vector and a matrix using dot product. - - Reference: [Attention Is All You Need (Vaswani et al, 2017)] - (https://api.semanticscholar.org/CorpusID:13756489) - - Registered as an `Attention` with name "dot_product". - """ - - def _forward_internal(self, vector: mindspore.Tensor, matrix: mindspore.Tensor) -> mindspore.Tensor: - return matrix.bmm(vector.unsqueeze(-1)).squeeze(-1) - -class LinearAttention(Attention): - """ - This `Attention` module performs a dot product between a vector of weights and some - combination of the two input vectors, followed by an (optional) activation function. The - combination used is configurable. - - If the two vectors are `x` and `y`, we allow the following kinds of combinations : `x`, - `y`, `x*y`, `x+y`, `x-y`, `x/y`, where each of those binary operations is performed - elementwise. You can list as many combinations as you want, comma separated. For example, you - might give `x,y,x*y` as the `combination` parameter to this class. The computed similarity - function would then be `w^T [x; y; x*y] + b`, where `w` is a vector of weights, `b` is a - bias parameter, and `[;]` is vector concatenation. - - Note that if you want a bilinear similarity function with a diagonal weight matrix W, where the - similarity function is computed as `x * w * y + b` (with `w` the diagonal of `W`), you can - accomplish that with this class by using "x*y" for `combination`. - - Registered as an `Attention` with name "linear". 
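# Usage sketch shared by the Attention subclasses above: score one query vector per
# batch element against a matrix of rows, masking out padded rows. `ops.randn` is
# assumed to follow the torch-style API of `mindnlp.core`.
import mindspore
from mindnlp.core import ops

attn = DotProductAttention()                       # normalize=True -> softmax weights
vector = ops.randn(2, 8)                           # (batch, embedding_dim)
matrix = ops.randn(2, 5, 8)                        # (batch, num_rows, embedding_dim)
mask = mindspore.Tensor([[1, 1, 1, 0, 0],
                         [1, 1, 1, 1, 1]], mindspore.bool_)
weights = attn(vector, matrix, mask)               # (batch, num_rows); each row sums to 1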
- - # Parameters - - tensor_1_dim : `int`, required - The dimension of the first tensor, `x`, described above. This is `x.size()[-1]` - the - length of the vector that will go into the similarity computation. We need this so we can - build weight vectors correctly. - tensor_2_dim : `int`, required - The dimension of the second tensor, `y`, described above. This is `y.size()[-1]` - the - length of the vector that will go into the similarity computation. We need this so we can - build weight vectors correctly. - combination : `str`, optional (default=`"x,y"`) - Described above. - activation : `Activation`, optional (default=`linear`) - An activation function applied after the `w^T * [x;y] + b` calculation. Default is - linear, i.e. no activation. - normalize : `bool`, optional (default=`True`) - """ - - def __init__( - self, - tensor_1_dim: int, - tensor_2_dim: int, - combination: str = "x,y", - activation: nn.Module = None, - normalize: bool = True, - ) -> None: - super().__init__(normalize) - self._combination = combination - combined_dim = get_combined_dim(combination, [tensor_1_dim, tensor_2_dim]) - self._weight_vector = Parameter(mindspore.Tensor(combined_dim)) - self._bias = Parameter(mindspore.Tensor(1)) - self._activation = activation - self.reset_parameters() - - def reset_parameters(self): - std = math.sqrt(6 / (self._weight_vector.shape[0] + 1)) - nn.init.uniform_(self._weight_vector, -std, std) - self._bias.data.fill_(0) - - def _forward_internal(self, vector: mindspore.Tensor, matrix: mindspore.Tensor) -> mindspore.Tensor: - combined_tensors = combine_tensors_and_multiply( - self._combination, [vector.unsqueeze(1), matrix], self._weight_vector - ) - return self._activation(combined_tensors.squeeze(1) + self._bias) - -class ScaledDotProductAttention(DotProductAttention): - """ - Computes attention between two tensors using scaled dot product. - # Reference: [Attention Is All You Need (Vaswani et al, 2017)] - # (https://api.semanticscholar.org/CorpusID:13756489) - - Registered as an `Attention` with name "scaled_dot_product". - - # Parameters - - scaling_factor : `int`, required - The similarity score is scaled down by the `scaling_factor`. - normalize : `bool`, optional (default=`True`) - If true, we normalize the computed similarities with a softmax, to return a probability - distribution for your attention. If false, this is just computing a similarity score. - """ - - def __init__(self, scaling_factor: Optional[int] = None, normalize: bool = True) -> None: - super().__init__(normalize) - self.scaling_factor = scaling_factor - - def _forward_internal(self, vector: mindspore.Tensor, matrix: mindspore.Tensor) -> mindspore.Tensor: - scores = super()._forward_internal(vector, matrix) - scaling_factor = self.scaling_factor or matrix.size(-1) - scores = scores / math.sqrt(scaling_factor) - return scores - -__all__ = [ - "ScaledDotProductAttention", - "DotProductAttention", - "LinearAttention", - "BilinearAttention", - "AdditiveAttention", - "CosineAttention", -] diff --git a/mindnlp/common/modules/crf.py b/mindnlp/common/modules/crf.py deleted file mode 100644 index e694d5a66..000000000 --- a/mindnlp/common/modules/crf.py +++ /dev/null @@ -1,355 +0,0 @@ -# Copyright 2022 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
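# `ScaledDotProductAttention` divides the raw dot products by sqrt(scaling_factor)
# (defaulting to the embedding dimension) so score variance does not grow with the
# width of the vectors. Sketch under the same torch-style API assumption as above:
from mindnlp.core import ops

attn = ScaledDotProductAttention()        # scaling_factor defaults to matrix.size(-1)
vector = ops.randn(1, 64)
matrix = ops.randn(1, 10, 64)
weights = attn(vector, matrix)            # softmax over the 10 rows, scaled by 1/sqrt(64)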
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -"""crf module""" -from typing import List, Optional - -import mindspore -from mindnlp.core import nn, ops - -class CRF(nn.Module): - """Conditional random field. - - This module implements a conditional random field [LMP01]_. The forward computation - of this class computes the log likelihood of the given sequence of tags and - emission score tensor. This class also has `~CRF.decode` method which finds - the best tag sequence given an emission score tensor using `Viterbi algorithm`_. - - Args: - num_tags: Number of tags. - batch_first: Whether the first dimension corresponds to the size of a minibatch. - - Attributes: - start_transitions (`~torch.nn.Parameter`): Start transition score tensor of size - ``(num_tags,)``. - end_transitions (`~torch.nn.Parameter`): End transition score tensor of size - ``(num_tags,)``. - transitions (`~torch.nn.Parameter`): Transition score tensor of size - ``(num_tags, num_tags)``. - - - .. [LMP01] Lafferty, J., McCallum, A., Pereira, F. (2001). - "Conditional random fields: Probabilistic models for segmenting and - labeling sequence data". *Proc. 18th International Conf. on Machine - Learning*. Morgan Kaufmann. pp. 282–289. - - .. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm - """ - - def __init__(self, num_tags: int, batch_first: bool = False) -> None: - if num_tags <= 0: - raise ValueError(f'invalid number of tags: {num_tags}') - super().__init__() - self.num_tags = num_tags - self.batch_first = batch_first - self.start_transitions = nn.Parameter(ops.empty(num_tags)) - self.end_transitions = nn.Parameter(ops.empty(num_tags)) - self.transitions = nn.Parameter(ops.empty(num_tags, num_tags)) - - self.reset_parameters() - - def reset_parameters(self) -> None: - """Initialize the transition parameters. - - The parameters will be initialized randomly from a uniform distribution - between -0.1 and 0.1. - """ - nn.init.uniform_(self.start_transitions, -0.1, 0.1) - nn.init.uniform_(self.end_transitions, -0.1, 0.1) - nn.init.uniform_(self.transitions, -0.1, 0.1) - - def __repr__(self) -> str: - return f'{self.__class__.__name__}(num_tags={self.num_tags})' - - def forward( - self, - emissions: mindspore.Tensor, - tags: mindspore.Tensor, - mask: Optional[mindspore.Tensor] = None, - reduction: str = 'sum', - ) -> mindspore.Tensor: - """Compute the conditional log likelihood of a sequence of tags given emission scores. - - Args: - emissions (`~mindspore.Tensor`): Emission score tensor of size - ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``, - ``(batch_size, seq_length, num_tags)`` otherwise. - tags (`~mindspore.Tensor`): Sequence of tags tensor of size - ``(seq_length, batch_size)`` if ``batch_first`` is ``False``, - ``(batch_size, seq_length)`` otherwise. - mask (`~mindspore.Tensor`): Mask tensor of size ``(seq_length, batch_size)`` - if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise. - reduction: Specifies the reduction to apply to the output: - ``none|sum|mean|token_mean``. 
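# A minimal NumPy sketch of the quantity this forward() returns: the sequence score minus
# the log-partition obtained with the forward-algorithm recursion (mirroring
# _compute_score and _compute_normalizer further down). The sizes and random inputs are
# assumptions for illustration; a brute-force sum over all tag paths checks the recursion.
import itertools
import numpy as np

rng = np.random.default_rng(0)
seq_len, num_tags = 4, 3
emissions = rng.normal(size=(seq_len, num_tags))
start, end = rng.normal(size=num_tags), rng.normal(size=num_tags)
trans = rng.normal(size=(num_tags, num_tags))

def path_score(tags):
    s = start[tags[0]] + emissions[0, tags[0]]
    for i in range(1, seq_len):
        s += trans[tags[i - 1], tags[i]] + emissions[i, tags[i]]
    return s + end[tags[-1]]

# Forward algorithm in log space: alpha[j] = logsumexp_i(alpha[i] + trans[i, j]) + emit[j].
alpha = start + emissions[0]
for i in range(1, seq_len):
    alpha = np.logaddexp.reduce(alpha[:, None] + trans, axis=0) + emissions[i]
log_z = np.logaddexp.reduce(alpha + end)

brute = np.logaddexp.reduce([path_score(t) for t in itertools.product(range(num_tags), repeat=seq_len)])
tags = [0, 2, 1, 1]
print(np.isclose(log_z, brute), path_score(tags) - log_z)  # True, then the log-likelihood of `tags`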
``none``: no reduction will be applied. - ``sum``: the output will be summed over batches. ``mean``: the output will be - averaged over batches. ``token_mean``: the output will be averaged over tokens. - - Returns: - `~mindspore.Tensor`: The log likelihood. This will have size ``(batch_size,)`` if - reduction is ``none``, ``()`` otherwise. - """ - self._validate(emissions, tags=tags, mask=mask) - if reduction not in ('none', 'sum', 'mean', 'token_mean'): - raise ValueError(f'invalid reduction: {reduction}') - if mask is None: - mask = ops.ones_like(tags, dtype=mindspore.bool_) - - if self.batch_first: - emissions = emissions.swapaxes(0, 1) - tags = tags.swapaxes(0, 1) - mask = mask.swapaxes(0, 1) - - # shape: (batch_size,) - numerator = self._compute_score(emissions, tags, mask) - # shape: (batch_size,) - denominator = self._compute_normalizer(emissions, mask) - # shape: (batch_size,) - llh = numerator - denominator - - if reduction == 'none': - return llh - if reduction == 'sum': - return llh.sum() - if reduction == 'mean': - return llh.mean() - assert reduction == 'token_mean' - return llh.sum() / mask.type_as(emissions).sum() - - def decode(self, emissions: mindspore.Tensor, - mask: Optional[mindspore.Tensor] = None) -> List[List[int]]: - """Find the most likely tag sequence using Viterbi algorithm. - - Args: - emissions (`~mindspore.Tensor`): Emission score tensor of size - ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``, - ``(batch_size, seq_length, num_tags)`` otherwise. - mask (`~mindspore.Tensor`): Mask tensor of size ``(seq_length, batch_size)`` - if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise. - - Returns: - List of list containing the best tag sequence for each batch. - """ - self._validate(emissions, mask=mask) - if mask is None: - mask = emissions.new_ones(emissions.shape[:2], dtype=mindspore.bool_) - - if self.batch_first: - emissions = emissions.swapaxes(0, 1) - mask = mask.swapaxes(0, 1) - - return self._viterbi_decode(emissions, mask) - - def _validate( - self, - emissions: mindspore.Tensor, - tags: Optional[mindspore.Tensor] = None, - mask: Optional[mindspore.Tensor] = None) -> None: - if emissions.ndim != 3: - raise ValueError(f'emissions must have dimension of 3, got {emissions.ndim}') - if emissions.shape[2] != self.num_tags: - raise ValueError( - f'expected last dimension of emissions is {self.num_tags}, ' - f'got {emissions.shape[2]}') - - if tags is not None: - if emissions.shape[:2] != tuple(tags.shape): - raise ValueError( - 'the first two dimensions of emissions and tags must match, ' - f'got {(emissions.shape[0], emissions.shape[1])} and {(tags.shape[0], tags.shape[1])}' - ) - - if mask is not None: - if emissions.shape[:2] != tuple(mask.shape): - raise ValueError( - 'the first two dimensions of emissions and mask must match, ' - f'got {(emissions.shape[0], emissions.shape[1])} and {(mask.shape[0], mask.shape[1])}' - ) - - no_empty_seq = not self.batch_first and mask[0].all() - no_empty_seq_bf = self.batch_first and mask[:, 0].all() - if not no_empty_seq and not no_empty_seq_bf: - raise ValueError('mask of the first timestep must all be on') - - def _compute_score( - self, emissions: mindspore.Tensor, tags: mindspore.Tensor, - mask: mindspore.Tensor) -> mindspore.Tensor: - # emissions: (seq_length, batch_size, num_tags) - # tags: (seq_length, batch_size) - # mask: (seq_length, batch_size) - assert emissions.ndim == 3 and tags.ndim == 2 - assert emissions.shape[:2] == tags.shape - assert emissions.shape[2] == 
self.num_tags - assert mask.shape == tags.shape - assert mask[0].all() - - seq_length, batch_size = tags.shape - mask = mask.type_as(emissions) - - # Start transition score and first emission - # shape: (batch_size,) - score = self.start_transitions[tags[0]] - score += emissions[0, ops.arange(batch_size), tags[0]] - - for i in range(1, seq_length): - # Transition score to next tag, only added if next timestep is valid (mask == 1) - # shape: (batch_size,) - score += self.transitions[tags[i - 1], tags[i]] * mask[i] - - # Emission score for next tag, only added if next timestep is valid (mask == 1) - # shape: (batch_size,) - score += emissions[i, ops.arange(batch_size), tags[i]] * mask[i] - - # End transition score - # shape: (batch_size,) - seq_ends = ops.sum(mask.long(), dim=0) - 1 - # shape: (batch_size,) - last_tags = tags[seq_ends, ops.arange(batch_size)] - # shape: (batch_size,) - score += self.end_transitions[last_tags] - - return score - - def _compute_normalizer( - self, emissions: mindspore.Tensor, mask: mindspore.Tensor) -> mindspore.Tensor: - # emissions: (seq_length, batch_size, num_tags) - # mask: (seq_length, batch_size) - assert emissions.ndim == 3 and mask.ndim == 2 - assert emissions.shape[:2] == mask.shape - assert emissions.shape[2] == self.num_tags - assert mask[0].all() - - seq_length = emissions.shape[0] - - # Start transition score and first emission; score has size of - # (batch_size, num_tags) where for each batch, the j-th column stores - # the score that the first timestep has tag j - # shape: (batch_size, num_tags) - score = self.start_transitions + emissions[0] - - for i in range(1, seq_length): - # Broadcast score for every possible next tag - # shape: (batch_size, num_tags, 1) - broadcast_score = score.unsqueeze(2) - - # Broadcast emission score for every possible current tag - # shape: (batch_size, 1, num_tags) - broadcast_emissions = emissions[i].unsqueeze(1) - - # Compute the score tensor of size (batch_size, num_tags, num_tags) where - # for each sample, entry at row i and column j stores the sum of scores of all - # possible tag sequences so far that end with transitioning from tag i to tag j - # and emitting - # shape: (batch_size, num_tags, num_tags) - next_score = broadcast_score + self.transitions + broadcast_emissions - - # Sum over all possible current tags, but we're in score space, so a sum - # becomes a log-sum-exp: for each sample, entry i stores the sum of scores of - # all possible tag sequences so far, that end in tag i - # shape: (batch_size, num_tags) - next_score = ops.logsumexp(next_score, dim=1) - - # Set score to the next score if this timestep is valid (mask == 1) - # shape: (batch_size, num_tags) - score = ops.where(mask[i].unsqueeze(1), next_score, score) - - # End transition score - # shape: (batch_size, num_tags) - score += self.end_transitions - - # Sum (log-sum-exp) over all possible tags - # shape: (batch_size,) - return ops.logsumexp(score, dim=1) - - def _viterbi_decode(self, emissions: mindspore.Tensor, - mask: mindspore.Tensor) -> List[List[int]]: - # emissions: (seq_length, batch_size, num_tags) - # mask: (seq_length, batch_size) - assert emissions.ndim == 3 and mask.ndim == 2 - assert emissions.shape[:2] == mask.shape - assert emissions.shape[2] == self.num_tags - assert mask[0].all() - - seq_length, batch_size = mask.shape - - # Start transition and first emission - # shape: (batch_size, num_tags) - score = self.start_transitions + emissions[0] - history: List[mindspore.Tensor] = [] - - # score is a tensor of size 
(batch_size, num_tags) where for every batch, - # value at column j stores the score of the best tag sequence so far that ends - # with tag j - # history saves where the best tags candidate transitioned from; this is used - # when we trace back the best tag sequence - - # Viterbi algorithm recursive case: we compute the score of the best tag sequence - # for every possible next tag - for i in range(1, seq_length): - # Broadcast viterbi score for every possible next tag - # shape: (batch_size, num_tags, 1) - broadcast_score = score.unsqueeze(2) - - # Broadcast emission score for every possible current tag - # shape: (batch_size, 1, num_tags) - broadcast_emission = emissions[i].unsqueeze(1) - - # Compute the score tensor of size (batch_size, num_tags, num_tags) where - # for each sample, entry at row i and column j stores the score of the best - # tag sequence so far that ends with transitioning from tag i to tag j and emitting - # shape: (batch_size, num_tags, num_tags) - next_score = broadcast_score + self.transitions + broadcast_emission - - # Find the maximum score over all possible current tag - # shape: (batch_size, num_tags) - next_score, indices = ops.max(next_score, dim=1) - - # Set score to the next score if this timestep is valid (mask == 1) - # and save the index that produces the next score - # shape: (batch_size, num_tags) - score = ops.where(mask[i].unsqueeze(1), next_score, score) - history.append(indices) - - # End transition score - # shape: (batch_size, num_tags) - score += self.end_transitions - - # Now, compute the best path for each sample - - # shape: (batch_size,) - seq_ends = ops.sum(mask.long(), dim=0) - 1 - best_tags_list: List[List[int]] = [] - - for idx in range(batch_size): - # Find the tag which maximizes the score at the last timestep; this is our best tag - # for the last timestep - _, best_last_tag = ops.max(score[idx], dim=0) - best_tags: List[int] = [] - best_tags.append(best_last_tag.item()) - - # We trace back where the best last tag comes from, append that to our best tag - # sequence, and trace it back again, and so on - # NOTE: reversed() cannot be used here because it is not supported by TorchScript, - # see https://github.com/pytorch/pytorch/issues/31772. - for hist in history[:seq_ends[idx]][::-1]: - best_last_tag = hist[idx][best_tags[-1]] - best_tags.append(best_last_tag.item()) - - # Reverse the order because we start from the last timestep - best_tags.reverse() - best_tags_list.append(best_tags) - - return best_tags_list - -__all__ = ["CRF"] diff --git a/mindnlp/common/modules/loss.py b/mindnlp/common/modules/loss.py deleted file mode 100644 index ea8a163f1..000000000 --- a/mindnlp/common/modules/loss.py +++ /dev/null @@ -1,206 +0,0 @@ -# Copyright 2022 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================ -"""Losses""" -import numpy as np -import mindspore -from mindspore import Tensor -from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F - -def _inner_log_softmax(inputs, axis): - """inner implementation of log_softmax, since the LogSoftmaxGrad op do not support inputs > 2d""" - return inputs - ops.logsumexp(inputs, axis, True) - -def sequence_mask(lengths, maxlen): - """generate mask matrix by seq_length""" - length_dtype = lengths.dtype - range_vector = Tensor(np.arange(0, maxlen, 1), length_dtype) - result = range_vector < lengths.view(lengths.shape + (1,)) - return result.astype(mindspore.float32) - - -class RDropLoss(nn.Module): - """ - R-Drop Loss implementation - For more information about R-drop please refer to this paper: https://arxiv.org/abs/2106.14448 - - Original implementation please refer to this code: https://github.com/dropreg/R-Drop - - Args: - reduction(str): Indicate how to average the loss, the candicates are "none", - "batchmean","mean","sum". Default: "none". - - - "mean": The reduced mean loss is returned. - - "batchmean": The sum loss divided by batch size is returned. - - "sum": The reduced sum loss is returned. - - "none": No reduction will be applied. - """ - def __init__(self, reduction='none'): - r""" - Initialize an instance of the RDropLoss class. - - Args: - self: The instance of the class. - reduction (str): Specifies the type of reduction to be applied during loss calculation. - It must be one of the following values: 'sum', 'mean', 'none', 'batchmean'. - - Returns: - None. This method does not return any value. - - Raises: - ValueError: If the specified 'reduction' parameter is not one of the allowed values - ('sum', 'mean', 'none', 'batchmean'), a ValueError is raised with a corresponding error message. - """ - super().__init__() - if reduction not in ['sum', 'mean', 'none', 'batchmean']: - raise ValueError( - f"'reduction' in 'RDropLoss' should be 'sum', 'mean' 'batchmean', or 'none', " - f"but received {reduction}.") - self.reduction = reduction - - def forward(self, p, q, pad_mask=None): - """ - Returns loss tensor, the rdrop loss of p and q. - - Args: - p (Tensor): The first forward logits of training examples. - q (Tensor): The second forward logits of training examples. - pad_mask(Tensor): The Tensor containing the binary mask to index with, - it's data type is bool. Default: None. - - Returns: - Tensor, the rdrop loss of p and q. - - Raises: - ValueError: if 'reduction' in 'RDropLoss' is not 'sum', 'mean' 'batchmean', or 'none'. - - Examples: - >>> r_drop_loss = RDropLoss() - >>> p = Tensor(np.array([1., 0. 
, 1.]), mindspore.float32) - >>> q = Tensor(np.array([0.2, 0.3 , 1.1]), mindspore.float32) - >>> loss = r_drop_loss(p, q) - >>> print(loss) - 0.100136 - """ - p_loss = F.kl_div(_inner_log_softmax(p, axis=-1), - F.softmax(q, dim=-1), - reduction=self.reduction) - q_loss = F.kl_div(_inner_log_softmax(q, axis=-1), - F.softmax(p, dim=-1), - reduction=self.reduction) - - # pad_mask is for seq-level tasks - if pad_mask is not None: - p_loss = ops.masked_select(p_loss, pad_mask) - q_loss = ops.masked_select(q_loss, pad_mask) - - # You can choose whether to use function "sum" and "mean" depending on your task - p_loss = p_loss.sum() - q_loss = q_loss.sum() - loss = (p_loss + q_loss) / 2 - return loss - - -class CMRC2018Loss(nn.Module): - r""" - CMRC2018Loss - used to compute CMRC2018 chinese Q&A task - - Args: - reduction(str): Indicate how to average the loss, the candicates are "mean" and "sum". - Default: "mean". - """ - def __init__(self, reduction='mean'): - r""" - Initializes an instance of the CMRC2018Loss class. - - Args: - self: The instance of the CMRC2018Loss class. - reduction (str): Specifies the type of reduction to apply to the loss. Valid options are 'mean' and 'sum'. - The default value is 'mean'. - - Returns: - None. This method does not return any value. - - Raises: - AssertionError: If the reduction parameter is not one of the valid options ('mean' or 'sum'). - - """ - super().__init__() - - assert reduction in ('mean', 'sum') - - self.reduction = reduction - - def forward(self, target_start, target_end, context_len, pred_start, pred_end): - """ - compute CMRC2018Loss - - Args: - target_start (Tensor): size: batch_size, dtype: int. - target_end (Tensor): size: batch_size, dtype: int. - context_len (Tensor): size: batch_size, dtype: float. - pred_start (Tensor): size: batch_size*max_len, dtype: float. - pred_end (Tensor): size: batch_size*max_len, dtype: float. - - Returns: - Tensor, the CMRC2018 loss. - - Raises: - ValueError: if 'reduction' is not 'sum' or 'mean'. 
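# A minimal NumPy sketch of the symmetric KL term computed by RDropLoss.forward earlier in
# this file; the logits and helper names are illustrative assumptions, not the mindspore
# implementation. With the docstring's inputs it gives roughly 0.10, matching that example.
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def kl_div(p_logits, q_logits):
    # KL(softmax(q) || softmax(p)), i.e. kl_div(log_softmax(p), softmax(q)) summed over classes.
    p_log = np.log(softmax(p_logits))
    q = softmax(q_logits)
    return (q * (np.log(q) - p_log)).sum()

p = np.array([1.0, 0.0, 1.0])   # logits from the first forward pass
q = np.array([0.2, 0.3, 1.1])   # logits from the second pass (different dropout mask)
loss = (kl_div(p, q) + kl_div(q, p)) / 2
print(float(loss))              # ~0.1001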
- - Example: - >>> cmrc_loss = CMRC2018Loss() - >>> tensor_a = mindspore.Tensor(np.array([1, 2, 1]), mindspore.int32) - >>> tensor_b = mindspore.Tensor(np.array([2, 1, 2]), mindspore.int32) - >>> my_context_len = mindspore.Tensor(np.array([2., 1., 2.]), mindspore.float32) - >>> tensor_c = mindspore.Tensor(np.array([ - >>> [0.1, 0.2, 0.1], - >>> [0.1, 0.2, 0.1], - >>> [0.1, 0.2, 0.1] - >>> ]), mindspore.float32) - >>> tensor_d = mindspore.Tensor(np.array([ - >>> [0.2, 0.1, 0.2], - >>> [0.2, 0.1, 0.2], - >>> [0.2, 0.1, 0.2] - >>> ]), mindspore.float32) - >>> my_loss = cmrc_loss(tensor_a, tensor_b, my_context_len, tensor_c, tensor_d) - >>> print(my_loss) - """ - batch_size, max_len = pred_end.shape - - zero_tensor = mindspore.Tensor( - np.zeros((batch_size, max_len)), mindspore.float32) - - mask = ops.equal(sequence_mask(context_len, max_len), zero_tensor) - - pred_start = pred_start.masked_fill(mask, -1e10) - pred_end = pred_end.masked_fill(mask, -1e10) - - start_loss = F.cross_entropy(pred_start, target_start, reduction='sum') - end_loss = F.cross_entropy(pred_end, target_end, reduction='sum') - - loss = start_loss + end_loss - - if self.reduction == 'mean': - loss = loss / batch_size - - return loss / 2 - - -__all__ = [ - 'RDropLoss', - 'CMRC2018Loss' -] diff --git a/mindnlp/common/modules/utils.py b/mindnlp/common/modules/utils.py deleted file mode 100644 index 2d32881a4..000000000 --- a/mindnlp/common/modules/utils.py +++ /dev/null @@ -1,259 +0,0 @@ -"""module utils""" -from typing import List, Sequence, TypeVar - -import mindspore -from mindspore._c_expression import typing # pylint: disable=no-name-in-module, import-error -from mindnlp.core import ops, nn - -T = TypeVar("T") - -def _rindex(sequence: Sequence[T], obj: T) -> int: - """ - Return zero-based index in the sequence of the last item whose value is equal to obj. Raises a - ValueError if there is no such item. - - # Parameters - - sequence : `Sequence[T]` - obj : `T` - - # Returns - - `int` - zero-based index associated to the position of the last item equal to obj - """ - for i in range(len(sequence) - 1, -1, -1): - if sequence[i] == obj: - return i - - raise ValueError(f"Unable to find {obj} in sequence {sequence}.") - -def info_value_of_dtype(dtype): - """ - Returns the `finfo` or `iinfo` object of a given PyTorch data type. Does not allow torch.bool. - """ - if dtype == mindspore.bool_: - raise TypeError("Does not support torch.bool") - elif isinstance(dtype, typing.Float): - return ops.finfo(dtype) - else: - return ops.iinfo(dtype) - - -def min_value_of_dtype(dtype): - """ - Returns the minimum value of a given PyTorch data type. Does not allow torch.bool. - """ - return float(info_value_of_dtype(dtype).min) - - -def max_value_of_dtype(dtype): - """ - Returns the maximum value of a given PyTorch data type. Does not allow torch.bool. - """ - return float(info_value_of_dtype(dtype).max) - - -def tiny_value_of_dtype(dtype): - """ - Returns a moderately tiny value for a given PyTorch data type that is used to avoid numerical - issues such as division by zero. - This is different from `info_value_of_dtype(dtype).tiny` because it causes some NaN bugs. - Only supports floating point dtypes. 
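# A quick NumPy analogue of the dtype helpers above (assumed equivalences, for intuition
# only): min_value_of_dtype gives the most negative representable value, useful for masking
# logits, while tiny_value_of_dtype is a small epsilon that keeps normalizations away from
# division by zero.
import numpy as np

min_val = float(np.finfo(np.float32).min)     # ~ -3.4e38, a safe "minus infinity" for masking
tiny = 1e-13                                  # the float32/float64 value returned above
probs = np.array([0.0, 0.0, 0.0])
print(min_val, probs / (probs.sum() + tiny))  # no division-by-zero warning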
- """ - if not dtype.is_floating_point: - raise TypeError("Only supports floating point dtypes.") - if dtype in (mindspore.float32, mindspore.float64): - return 1e-13 - elif dtype == mindspore.float16: - return 1e-4 - else: - raise TypeError("Does not support dtype " + str(dtype)) - -def masked_softmax( - vector: mindspore.Tensor, - mask: mindspore.Tensor, - dim: int = -1, - memory_efficient: bool = False, -) -> mindspore.Tensor: - """ - `torch.nn.functional.softmax(vector)` does not work if some elements of `vector` should be - masked. This performs a softmax on just the non-masked portions of `vector`. Passing - `None` in for the mask is also acceptable; you'll just get a regular softmax. - - `vector` can have an arbitrary number of dimensions; the only requirement is that `mask` is - broadcastable to `vector's` shape. If `mask` has fewer dimensions than `vector`, we will - unsqueeze on dimension 1 until they match. If you need a different unsqueezing of your mask, - do it yourself before passing the mask into this function. - - If `memory_efficient` is set to true, we will simply use a very large negative number for those - masked positions so that the probabilities of those positions would be approximately 0. - This is not accurate in math, but works for most cases and consumes less memory. - - In the case that the input vector is completely masked and `memory_efficient` is false, this function - returns an array of `0.0`. This behavior may cause `NaN` if this is used as the last layer of - a model that uses categorical cross-entropy loss. Instead, if `memory_efficient` is true, this function - will treat every element as equal, and do softmax over equal numbers. - """ - if mask is None: - result = nn.functional.softmax(vector, dim=dim) - else: - while mask.ndim < vector.ndim: - mask = mask.unsqueeze(1) - if not memory_efficient: - # To limit numerical errors from large vector elements outside the mask, we zero these out. 
- result = nn.functional.softmax(vector * mask, dim=dim) - result = result * mask - result = result / ( - result.sum(dim=dim, keepdim=True) + tiny_value_of_dtype(result.dtype) - ) - else: - masked_vector = vector.masked_fill(~mask, min_value_of_dtype(vector.dtype)) - result = nn.functional.softmax(masked_vector, dim=dim) - return result - -def _get_combination(combination: str, tensors: List[mindspore.Tensor]) -> mindspore.Tensor: - if combination.isdigit(): - index = int(combination) - 1 - return tensors[index] - else: - if len(combination) != 3: - raise ValueError("Invalid combination: " + combination) - first_tensor = _get_combination(combination[0], tensors) - second_tensor = _get_combination(combination[2], tensors) - operation = combination[1] - if operation == "*": - return first_tensor * second_tensor - elif operation == "/": - return first_tensor / second_tensor - elif operation == "+": - return first_tensor + second_tensor - elif operation == "-": - return first_tensor - second_tensor - else: - raise ValueError("Invalid operation: " + operation) - - -def _get_combination_and_multiply( - combination: str, tensors: List[mindspore.Tensor], weight: nn.Parameter -) -> mindspore.Tensor: - if combination.isdigit(): - index = int(combination) - 1 - return ops.matmul(tensors[index], weight) - else: - if len(combination) != 3: - raise ValueError("Invalid combination: " + combination) - first_tensor = _get_combination(combination[0], tensors) - second_tensor = _get_combination(combination[2], tensors) - operation = combination[1] - if operation == "*": - if first_tensor.ndim > 4 or second_tensor.ndim > 4: - raise ValueError("Tensors with dim > 4 not currently supported") - desired_dim = max(first_tensor.ndim, second_tensor.ndim) - 1 - if first_tensor.ndim == 4: - expanded_dim = _rindex(first_tensor.size(), 1) - first_tensor = first_tensor.squeeze(expanded_dim) - if second_tensor.ndim == 4: - expanded_dim = _rindex(second_tensor.size(), 1) - second_tensor = second_tensor.squeeze(expanded_dim) - intermediate = first_tensor * weight - result = ops.matmul(intermediate, second_tensor.swapaxes(-1, -2)) - if result.ndim == desired_dim + 1: - result = result.squeeze(-1) - return result - elif operation == "/": - if first_tensor.ndim > 4 or second_tensor.ndim > 4: - raise ValueError("Tensors with dim > 4 not currently supported") - desired_dim = max(first_tensor.ndim, second_tensor.ndim) - 1 - if first_tensor.ndim == 4: - expanded_dim = _rindex(first_tensor.size(), 1) - first_tensor = first_tensor.squeeze(expanded_dim) - if second_tensor.ndim == 4: - expanded_dim = _rindex(second_tensor.size(), 1) - second_tensor = second_tensor.squeeze(expanded_dim) - intermediate = first_tensor * weight - result = ops.matmul(intermediate, second_tensor.pow(-1).swapaxes(-1, -2)) - if result.ndim == desired_dim + 1: - result = result.squeeze(-1) - return result - elif operation == "+": - return ops.matmul(first_tensor, weight) + ops.matmul(second_tensor, weight) - elif operation == "-": - return ops.matmul(first_tensor, weight) - ops.matmul(second_tensor, weight) - else: - raise ValueError("Invalid operation: " + operation) - -def get_combined_dim(combination: str, tensor_dims: List[int]) -> int: - """ - For use with [`combine_tensors`](./util.md#combine_tensors). - This function computes the resultant dimension when calling `combine_tensors(combination, tensors)`, - when the tensor dimension is known. This is necessary for knowing the sizes of weight matrices - when building models that use `combine_tensors`. 
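# A small NumPy sketch of the "combination" mini-language documented here (names and shapes
# are assumptions): for combination "x,y,x*y" the combined width is d + d + d, and the
# weighted combination w^T [x; y; x*y] can be computed piece by piece without ever
# materialising the concatenation, which is the point of combine_tensors_and_multiply.
import numpy as np

rng = np.random.default_rng(0)
d = 4
x, y = rng.normal(size=d), rng.normal(size=d)
pieces = [x, y, x * y]
w = rng.normal(size=3 * d)

concat_then_dot = np.concatenate(pieces) @ w
piecewise = sum(p @ w[i * d:(i + 1) * d] for i, p in enumerate(pieces))
print(np.isclose(concat_then_dot, piecewise))  # True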
- - # Parameters - - combination : `str` - A comma-separated list of combination pieces, like `"1,2,1*2"`, specified identically to - `combination` in `combine_tensors`. - tensor_dims : `List[int]` - A list of tensor dimensions, where each dimension is from the `last axis` of the tensors - that will be input to `combine_tensors`. - """ - if len(tensor_dims) > 9: - raise ValueError("Double-digit tensor lists not currently supported") - combination = combination.replace("x", "1").replace("y", "2") - return sum(_get_combination_dim(piece, tensor_dims) for piece in combination.split(",")) - - -def _get_combination_dim(combination: str, tensor_dims: List[int]) -> int: - if combination.isdigit(): - index = int(combination) - 1 - return tensor_dims[index] - else: - if len(combination) != 3: - raise ValueError("Invalid combination: " + combination) - first_tensor_dim = _get_combination_dim(combination[0], tensor_dims) - second_tensor_dim = _get_combination_dim(combination[2], tensor_dims) - operation = combination[1] - if first_tensor_dim != second_tensor_dim: - raise ValueError('Tensor dims must match for operation "{}"'.format(operation)) - return first_tensor_dim - -def combine_tensors_and_multiply( - combination: str, tensors: List[mindspore.Tensor], weights: nn.Parameter -) -> mindspore.Tensor: - """ - Like [`combine_tensors`](./util.md#combine_tensors), but does a weighted (linear) - multiplication while combining. This is a separate function from `combine_tensors` - because we try to avoid instantiating large intermediate tensors during the combination, - which is possible because we know that we're going to be multiplying by a weight vector in the end. - - # Parameters - - combination : `str` - Same as in `combine_tensors` - tensors : `List[mindspore.Tensor]` - A list of tensors to combine, where the integers in the `combination` are (1-indexed) - positions in this list of tensors. These tensors are all expected to have either three or - four dimensions, with the final dimension being an embedding. If there are four - dimensions, one of them must have length 1. - weights : `torch.nn.Parameter` - A vector of weights to use for the combinations. This should have shape (combined_dim,), - as calculated by `get_combined_dim`. - """ - if len(tensors) > 9: - raise ValueError("Double-digit tensor lists not currently supported") - combination = combination.replace("x", "1").replace("y", "2") - pieces = combination.split(",") - tensor_dims = [tensor.size(-1) for tensor in tensors] - combination_dims = [_get_combination_dim(piece, tensor_dims) for piece in pieces] - dims_so_far = 0 - to_sum = [] - for piece, combination_dim in zip(pieces, combination_dims): - weight = weights[dims_so_far : (dims_so_far + combination_dim)] - dims_so_far += combination_dim - to_sum.append(_get_combination_and_multiply(piece, tensors, weight)) - result = to_sum[0] - for result_piece in to_sum[1:]: - result = result + result_piece - return result diff --git a/mindnlp/common/optimization.py b/mindnlp/common/optimization.py deleted file mode 100644 index 64dfc6402..000000000 --- a/mindnlp/common/optimization.py +++ /dev/null @@ -1,767 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""MindSpore optimization for BERT model.""" - -import math -from functools import partial -from typing import Optional, Union - -from mindspore import ops -from mindnlp.core.optim import Optimizer -from mindnlp.core.optim.lr_scheduler import LRScheduler, LambdaLR, ReduceLROnPlateau - -from ..engine.utils import SchedulerType -from ..utils import logging - - -logger = logging.get_logger(__name__) - -class LayerWiseDummyOptimizer(Optimizer): - """ - For Layer-wise optimizers such as GaLoRE optimizer, the optimization - step is already done through the post gradient hooks. Therefore - the trick is to create a dummy optimizer that can take arbitrary - args and kwargs and return a no-op during training. - - Initial idea from @hiyouga in LLaMA-Factory: - https://github.com/hiyouga/LLaMA-Factory/commit/8664262cde3919e10eaecbd66e8c5d356856362e#diff-ebe08ab14496dfb9e06075f0fdd36799ef6d1535cc4dd4715b74c4e3e06fe3ba - """ - def __init__(self, *args, optimizer_dict=None, **kwargs): - r""" - __init__ - - Args: - self (object): The instance of the class. - *args: Variable length argument list. - optimizer_dict (dict, optional): A dictionary containing optimizer settings. Defaults to None. - **kwargs: Arbitrary keyword arguments. Here, it is used to extract the learning rate ('lr') from the keyword arguments. - - Returns: - None: This method does not return any value. - - Raises: - None - """ - dummy_tensor = ops.randn(1, 1) - self.optimizer_dict = optimizer_dict - super().__init__([dummy_tensor], {"lr": kwargs.get("lr", 1e-03)}) - - -class LayerWiseDummyScheduler(LRScheduler): - """ - For Layer-wise optimizers such as GaLoRE optimizer, the optimization and scheduling step - are already done through the post gradient hooks. Therefore - the trick is to create a dummy scheduler that can take arbitrary - args and kwargs and return a no-op during training. - """ - def __init__(self, *args, **kwargs): - r""" - Initializes a new instance of the LayerWiseDummyScheduler class. - - Args: - self: The instance of the LayerWiseDummyScheduler class. - - Returns: - None. This method does not return any value. - - Raises: - N/A. This method does not raise any exceptions. - """ - optimizer = LayerWiseDummyOptimizer() - last_epoch = -1 - super().__init__(optimizer, last_epoch) - - def get_lr(self): - r""" - Get the learning rates of all parameter groups in the optimizer. - - Args: - self (LayerWiseDummyScheduler): The object instance. - - Returns: - list: A list of learning rates for each parameter group in the optimizer. - - Raises: - None. - - ''' - - This docstring describes the 'get_lr' method in the 'LayerWiseDummyScheduler' class. The method takes one parameter, 'self', which is an instance of the 'LayerWiseDummyScheduler' class. The purpose of -this method is to retrieve the learning rates of all parameter groups in the optimizer. - - The method returns a list, where each element represents the learning rate of a parameter group in the optimizer. The type of the return value is a list. - - No exceptions are raised by this method. 
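# A stripped-down sketch of what this dummy scheduler does (assumed structures, no
# mindspore or torch involved): the real updates happen in per-parameter hooks, so the
# "scheduler" only has to echo whatever learning rate each optimizer param group holds.
class _FakeOptimizer:
    def __init__(self, lrs):
        self.param_groups = [{"lr": lr} for lr in lrs]

class _DummyScheduler:
    def __init__(self, optimizer):
        self.optimizer = optimizer
    def get_lr(self):
        return [group["lr"] for group in self.optimizer.param_groups]
    def step(self):
        pass  # a no-op: the layer-wise optimizers already stepped in their hooks

sched = _DummyScheduler(_FakeOptimizer([1e-3, 5e-4]))
print(sched.get_lr())  # [0.001, 0.0005]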
- """ - return [group["lr"] for group in self.optimizer.param_groups] - - def _get_closed_form_lr(self): - r""" - This method _get_closed_form_lr in the class LayerWiseDummyScheduler computes the closed form learning rate (LR). - - Args: - self: An instance of the LayerWiseDummyScheduler class. - - Returns: - None. The method returns the computed base learning rates. - - Raises: - This method does not raise any exceptions. - """ - return self.base_lrs - - -def _get_constant_lambda(_=None): - r""" - This function returns a constant lambda value of 1. - - Args: - _: This parameter is not used and can be ignored. - - Returns: - None: This function does not return any value. - - Raises: - None: This function does not raise any exceptions. - """ - return 1 - - -def get_constant_schedule(optimizer: Optimizer, last_epoch: int = -1): - """ - Create a schedule with a constant learning rate, using the learning rate set in optimizer. - - Args: - optimizer ([`~torch.optim.Optimizer`]): - The optimizer for which to schedule the learning rate. - last_epoch (`int`, *optional*, defaults to -1): - The index of the last epoch when resuming training. - - Return: - `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. - """ - return LambdaLR(optimizer, _get_constant_lambda, last_epoch=last_epoch) - - -def get_reduce_on_plateau_schedule(optimizer: Optimizer, **kwargs): - """ - Create a schedule with a constant learning rate that decreases when a metric has stopped improving. - - Args: - optimizer ([`~torch.optim.Optimizer`]): - The optimizer for which to schedule the learning rate. - kwargs (`dict`, *optional*): - Extra parameters to be passed to the scheduler. See `torch.optim.lr_scheduler.ReduceLROnPlateau` - for possible parameters. - - Return: - `torch.optim.lr_scheduler.ReduceLROnPlateau` with the appropriate schedule. - """ - return ReduceLROnPlateau(optimizer, **kwargs) - - -def _get_constant_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int): - r""" - Args: - current_step (int): The current step in the learning rate schedule. - num_warmup_steps (int): The number of warmup steps to gradually increase the learning rate. - - Returns: - None: This function does not return any value. - - Raises: - None. - """ - if current_step < num_warmup_steps: - return float(current_step) / float(max(1.0, num_warmup_steps)) - return 1.0 - - -def get_constant_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: int, last_epoch: int = -1): - """ - Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate - increases linearly between 0 and the initial lr set in the optimizer. - - Args: - optimizer ([`~torch.optim.Optimizer`]): - The optimizer for which to schedule the learning rate. - num_warmup_steps (`int`): - The number of steps for the warmup phase. - last_epoch (`int`, *optional*, defaults to -1): - The index of the last epoch when resuming training. - - Return: - `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. - """ - lr_lambda = partial(_get_constant_schedule_with_warmup_lr_lambda, num_warmup_steps=num_warmup_steps) - return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch) - - -def _get_linear_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int, num_training_steps: int): - r""" - Args: - current_step (int): The current step in the training process. - num_warmup_steps (int): The number of warm-up steps at the beginning of the training. 
- num_training_steps (int): The total number of training steps. - - Returns: - None. The function does not return a value, but it updates the learning rate schedule. - - Raises: - None. - """ - if current_step < num_warmup_steps: - return float(current_step) / float(max(1, num_warmup_steps)) - return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))) - - -def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): - """ - Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after - a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer. - - Args: - optimizer ([`~torch.optim.Optimizer`]): - The optimizer for which to schedule the learning rate. - num_warmup_steps (`int`): - The number of steps for the warmup phase. - num_training_steps (`int`): - The total number of training steps. - last_epoch (`int`, *optional*, defaults to -1): - The index of the last epoch when resuming training. - - Return: - `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. - """ - lr_lambda = partial( - _get_linear_schedule_with_warmup_lr_lambda, - num_warmup_steps=num_warmup_steps, - num_training_steps=num_training_steps, - ) - return LambdaLR(optimizer, lr_lambda, last_epoch) - - -def _get_cosine_schedule_with_warmup_lr_lambda( - current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: float -): - """ - Args: - current_step (int): The current step in the training process. - num_warmup_steps (int): The number of warmup steps before the learning rate reaches its maximum value. - num_training_steps (int): The total number of training steps. - num_cycles (float): The number of cosine cycles for the schedule. - - Returns: - None: This function does not return any value. - - Raises: - None - """ - if current_step < num_warmup_steps: - return float(current_step) / float(max(1, num_warmup_steps)) - progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) - return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) - - -def get_cosine_schedule_with_warmup( - optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: float = 0.5, last_epoch: int = -1 -): - """ - Create a schedule with a learning rate that decreases following the values of the cosine function between the - initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the - initial lr set in the optimizer. - - Args: - optimizer ([`~torch.optim.Optimizer`]): - The optimizer for which to schedule the learning rate. - num_warmup_steps (`int`): - The number of steps for the warmup phase. - num_training_steps (`int`): - The total number of training steps. - num_cycles (`float`, *optional*, defaults to 0.5): - The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 - following a half-cosine). - last_epoch (`int`, *optional*, defaults to -1): - The index of the last epoch when resuming training. - - Return: - `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. 
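# A plain-Python check of the two warm-up multipliers defined above (step counts are
# illustrative assumptions): linear warm-up followed by a linear decay to 0, and the
# half-cosine decay used by get_cosine_schedule_with_warmup with num_cycles=0.5.
import math

def linear_lambda(step, warmup=10, total=100):
    if step < warmup:
        return step / max(1, warmup)
    return max(0.0, (total - step) / max(1, total - warmup))

def cosine_lambda(step, warmup=10, total=100, num_cycles=0.5):
    if step < warmup:
        return step / max(1, warmup)
    progress = (step - warmup) / max(1, total - warmup)
    return max(0.0, 0.5 * (1.0 + math.cos(math.pi * num_cycles * 2.0 * progress)))

# The multiplier scales the base lr set in the optimizer, as LambdaLR does.
print([round(linear_lambda(s), 3) for s in (0, 5, 10, 55, 100)])   # [0.0, 0.5, 1.0, 0.5, 0.0]
print([round(cosine_lambda(s), 3) for s in (0, 5, 10, 55, 100)])   # [0.0, 0.5, 1.0, 0.5, 0.0]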
- """ - lr_lambda = partial( - _get_cosine_schedule_with_warmup_lr_lambda, - num_warmup_steps=num_warmup_steps, - num_training_steps=num_training_steps, - num_cycles=num_cycles, - ) - return LambdaLR(optimizer, lr_lambda, last_epoch) - - -def _get_cosine_with_hard_restarts_schedule_with_warmup_lr_lambda( - current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: int -): - r""" - Calculates the learning rate lambda value for a cosine schedule with hard restarts and warm-up. - - Args: - current_step (int): The current step in the training process. - - Returns: - float: The learning rate lambda value. - - Raises: - None. - - This function calculates the learning rate lambda value based on the current step in the training process. It uses a cosine schedule with hard restarts and warm-up. The learning rate lambda value is used -to adjust the learning rate during training. - - The function takes the following parameters: - - current_step: The current step in the training process. It should be an integer. - - The function returns the learning rate lambda value as a float. The lambda value is used to adjust the learning rate for the current step. - - No exceptions are raised by this function. - """ - if current_step < num_warmup_steps: - return float(current_step) / float(max(1, num_warmup_steps)) - progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) - if progress >= 1.0: - return 0.0 - return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0)))) - - -def get_cosine_with_hard_restarts_schedule_with_warmup( - optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: int = 1, last_epoch: int = -1 -): - """ - Create a schedule with a learning rate that decreases following the values of the cosine function between the - initial lr set in the optimizer to 0, with several hard restarts, after a warmup period during which it increases - linearly between 0 and the initial lr set in the optimizer. - - Args: - optimizer ([`~torch.optim.Optimizer`]): - The optimizer for which to schedule the learning rate. - num_warmup_steps (`int`): - The number of steps for the warmup phase. - num_training_steps (`int`): - The total number of training steps. - num_cycles (`int`, *optional*, defaults to 1): - The number of hard restarts to use. - last_epoch (`int`, *optional*, defaults to -1): - The index of the last epoch when resuming training. - - Return: - `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. - """ - lr_lambda = partial( - _get_cosine_with_hard_restarts_schedule_with_warmup_lr_lambda, - num_warmup_steps=num_warmup_steps, - num_training_steps=num_training_steps, - num_cycles=num_cycles, - ) - return LambdaLR(optimizer, lr_lambda, last_epoch) - - -def _get_polynomial_decay_schedule_with_warmup_lr_lambda( - current_step: int, - *, - num_warmup_steps: int, - num_training_steps: int, - lr_end: float, - power: float, - lr_init: int, -): - """ - Args: - current_step (int): The current step in the training process. - It represents the progress of the training. - num_warmup_steps (int): The number of warmup steps at the beginning of training. - Determines the portion of training steps used for warmup. - num_training_steps (int): The total number of training steps. - Represents the duration of the entire training process. - lr_end (float): The final learning rate value to decay towards. - Specifies the target learning rate at the end of training. 
- power (float): The power factor used in the polynomial decay calculation. - Influences the rate of decay of the learning rate. - lr_init (int): The initial learning rate value at the start of training. - Represents the starting learning rate value. - - Returns: - None: This function does not return a value explicitly, but modifies the learning rate. - - Raises: - ValueError: If the current_step is negative or if the lr_init is zero. - """ - if current_step < num_warmup_steps: - return float(current_step) / float(max(1, num_warmup_steps)) - elif current_step > num_training_steps: - return lr_end / lr_init # as LambdaLR multiplies by lr_init - else: - lr_range = lr_init - lr_end - decay_steps = num_training_steps - num_warmup_steps - pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps - decay = lr_range * pct_remaining**power + lr_end - return decay / lr_init # as LambdaLR multiplies by lr_init - - -def get_polynomial_decay_schedule_with_warmup( - optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1 -): - """ - Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the - optimizer to end lr defined by *lr_end*, after a warmup period during which it increases linearly from 0 to the - initial lr set in the optimizer. - - Args: - optimizer ([`~torch.optim.Optimizer`]): - The optimizer for which to schedule the learning rate. - num_warmup_steps (`int`): - The number of steps for the warmup phase. - num_training_steps (`int`): - The total number of training steps. - lr_end (`float`, *optional*, defaults to 1e-7): - The end LR. - power (`float`, *optional*, defaults to 1.0): - Power factor. - last_epoch (`int`, *optional*, defaults to -1): - The index of the last epoch when resuming training. - - Note: *power* defaults to 1.0 as in the fairseq implementation, which in turn is based on the original BERT - implementation at - https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/optimization.py#L37 - - Return: - `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. - - """ - lr_init = optimizer.defaults["lr"] - if not (lr_init > lr_end): - raise ValueError(f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})") - - lr_lambda = partial( - _get_polynomial_decay_schedule_with_warmup_lr_lambda, - num_warmup_steps=num_warmup_steps, - num_training_steps=num_training_steps, - lr_end=lr_end, - power=power, - lr_init=lr_init, - ) - return LambdaLR(optimizer, lr_lambda, last_epoch) - - -def _get_inverse_sqrt_schedule_lr_lambda(current_step: int, *, num_warmup_steps: int, timescale: int = None): - r""" - This function calculates the learning rate decay based on the inverse square root schedule. - - Args: - current_step (int): The current step in the learning process. - - Keyword Args: - num_warmup_steps (int): The number of warm-up steps before the learning rate starts decaying. - timescale (int, optional): The timescale parameter used in the decay calculation. Defaults to None. - - Returns: - float: The decayed learning rate value. - - Raises: - None. - - This function returns the decayed learning rate value based on the inverse square root schedule. If the current step is less than the number of warm-up steps, it returns the current step divided by the -maximum of 1 and the number of warm-up steps. Otherwise, it calculates the decayed learning rate using the inverse square root formula. 
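# A plain-Python illustration of the two decay rules discussed here (step counts and lr
# values are assumptions): polynomial decay from lr_init to lr_end after warm-up, and the
# inverse square-root decay whose timescale defaults to the number of warm-up steps.
import math

def poly_lambda(step, warmup=10, total=100, lr_init=1e-3, lr_end=1e-7, power=1.0):
    if step < warmup:
        return step / max(1, warmup)
    if step > total:
        return lr_end / lr_init                      # LambdaLR multiplies by lr_init again
    remaining = 1 - (step - warmup) / (total - warmup)
    return ((lr_init - lr_end) * remaining ** power + lr_end) / lr_init

def inv_sqrt_lambda(step, warmup=10, timescale=10):
    if step < warmup:
        return step / max(1, warmup)
    return 1.0 / math.sqrt((step + timescale - warmup) / timescale)

print(round(poly_lambda(55), 4), round(poly_lambda(100), 4))         # ~0.5 then ~1e-4
print(round(inv_sqrt_lambda(10), 3), round(inv_sqrt_lambda(40), 3))  # 1.0 then 0.5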
- """ - if current_step < num_warmup_steps: - return float(current_step) / float(max(1, num_warmup_steps)) - shift = timescale - num_warmup_steps - decay = 1.0 / math.sqrt((current_step + shift) / timescale) - return decay - - -def get_inverse_sqrt_schedule( - optimizer: Optimizer, num_warmup_steps: int, timescale: int = None, last_epoch: int = -1 -): - """ - Create a schedule with an inverse square-root learning rate, from the initial lr set in the optimizer, after a - warmup period which increases lr linearly from 0 to the initial lr set in the optimizer. - - Args: - optimizer ([`~torch.optim.Optimizer`]): - The optimizer for which to schedule the learning rate. - num_warmup_steps (`int`): - The number of steps for the warmup phase. - timescale (`int`, *optional*, defaults to `num_warmup_steps`): - Time scale. - last_epoch (`int`, *optional*, defaults to -1): - The index of the last epoch when resuming training. - - Return: - `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. - """ - # Note: this implementation is adapted from - # https://github.com/google-research/big_vision/blob/f071ce68852d56099437004fd70057597a95f6ef/big_vision/utils.py#L930 - - if timescale is None: - timescale = num_warmup_steps or 10_000 - - lr_lambda = partial(_get_inverse_sqrt_schedule_lr_lambda, num_warmup_steps=num_warmup_steps, timescale=timescale) - return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch) - - -def _get_cosine_schedule_with_warmup_lr_lambda( - current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: float, min_lr_rate: float = 0.0 -): - r""" - This function implements a cosine learning rate schedule with warmup for a given current step. The learning rate is adjusted based on the progress of the training. - - Args: - current_step (int): The current step in the training process. - - Returns: - float: The adjusted learning rate at the current step. - - Raises: - None - - The function calculates the learning rate adjustment based on the number of warmup steps, training steps, number of cycles, and minimum learning rate rate. If the current step is less than the number of -warmup steps, the learning rate is linearly increased. Otherwise, the learning rate is adjusted using a cosine function with the given number of cycles. The learning rate is then scaled by the minimum learning -rate rate. - - The function returns the maximum of 0 and the adjusted learning rate factor, ensuring a non-negative learning rate. - """ - if current_step < num_warmup_steps: - return float(current_step) / float(max(1, num_warmup_steps)) - progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) - factor = 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)) - factor = factor * (1 - min_lr_rate) + min_lr_rate - return max(0, factor) - - -def get_cosine_with_min_lr_schedule_with_warmup( - optimizer: Optimizer, - num_warmup_steps: int, - num_training_steps: int, - num_cycles: float = 0.5, - last_epoch: int = -1, - min_lr: float = None, - min_lr_rate: float = None, -): - """ - Create a schedule with a learning rate that decreases following the values of the cosine function between the - initial lr set in the optimizer to min_lr, after a warmup period during which it increases linearly between 0 and the - initial lr set in the optimizer. - - Args: - optimizer ([`~torch.optim.Optimizer`]): - The optimizer for which to schedule the learning rate. - num_warmup_steps (`int`): - The number of steps for the warmup phase. 
- num_training_steps (`int`): - The total number of training steps. - num_cycles (`float`, *optional*, defaults to 0.5): - The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 - following a half-cosine). - last_epoch (`int`, *optional*, defaults to -1): - The index of the last epoch when resuming training. - min_lr (`float`, *optional*): - The minimum learning rate to reach after the cosine schedule. - min_lr_rate (`float`, *optional*): - The minimum learning rate as a ratio of the initial learning rate. If set, `min_lr` should not be set. - - Return: - `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. - """ - if min_lr is not None and min_lr_rate is not None: - raise ValueError("Only one of min_lr or min_lr_rate should be set") - elif min_lr is not None: - min_lr_rate = min_lr / optimizer.defaults["lr"] - elif min_lr_rate is None: - raise ValueError("One of min_lr or min_lr_rate should be set through the `lr_scheduler_kwargs`") - - lr_lambda = partial( - _get_cosine_schedule_with_warmup_lr_lambda, - num_warmup_steps=num_warmup_steps, - num_training_steps=num_training_steps, - num_cycles=num_cycles, - min_lr_rate=min_lr_rate, - ) - return LambdaLR(optimizer, lr_lambda, last_epoch) - - -TYPE_TO_SCHEDULER_FUNCTION = { - SchedulerType.LINEAR: get_linear_schedule_with_warmup, - SchedulerType.COSINE: get_cosine_schedule_with_warmup, - SchedulerType.COSINE_WITH_RESTARTS: get_cosine_with_hard_restarts_schedule_with_warmup, - SchedulerType.POLYNOMIAL: get_polynomial_decay_schedule_with_warmup, - SchedulerType.CONSTANT: get_constant_schedule, - SchedulerType.CONSTANT_WITH_WARMUP: get_constant_schedule_with_warmup, - SchedulerType.INVERSE_SQRT: get_inverse_sqrt_schedule, - SchedulerType.REDUCE_ON_PLATEAU: get_reduce_on_plateau_schedule, - SchedulerType.COSINE_WITH_MIN_LR: get_cosine_with_min_lr_schedule_with_warmup, -} - - -def get_scheduler( - name: Union[str, SchedulerType], - optimizer: Optimizer, - num_warmup_steps: Optional[int] = None, - num_training_steps: Optional[int] = None, - scheduler_specific_kwargs: Optional[dict] = None, -): - """ - Unified API to get any scheduler from its name. - - Args: - name (`str` or `SchedulerType`): - The name of the scheduler to use. - optimizer (`torch.optim.Optimizer`): - The optimizer that will be used during training. - num_warmup_steps (`int`, *optional*): - The number of warmup steps to do. This is not required by all schedulers (hence the argument being - optional), the function will raise an error if it's unset and the scheduler type requires it. - num_training_steps (`int``, *optional*): - The number of training steps to do. This is not required by all schedulers (hence the argument being - optional), the function will raise an error if it's unset and the scheduler type requires it. - scheduler_specific_kwargs (`dict`, *optional*): - Extra parameters for schedulers such as cosine with restarts. Mismatched scheduler types and scheduler - parameters will cause the scheduler function to raise a TypeError. 
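# A quick numeric check (with assumed step counts) of the min-lr cosine variant wired into
# TYPE_TO_SCHEDULER_FUNCTION below: the cosine factor is rescaled so it decays to
# min_lr_rate instead of 0, and a given min_lr is simply converted to min_lr / initial_lr.
import math

def cosine_min_lr_lambda(step, warmup=10, total=100, num_cycles=0.5, min_lr_rate=0.1):
    if step < warmup:
        return step / max(1, warmup)
    progress = (step - warmup) / max(1, total - warmup)
    factor = 0.5 * (1.0 + math.cos(math.pi * num_cycles * 2.0 * progress))
    return max(0.0, factor * (1 - min_lr_rate) + min_lr_rate)

print([round(cosine_min_lr_lambda(s), 3) for s in (10, 55, 100)])  # [1.0, 0.55, 0.1]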
- """ - name = SchedulerType(name) - schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name] - - # If a `LayerWiseDummyOptimizer` is passed we extract the optimizer dict and - # recursively call `get_scheduler` to get the proper schedulers on each parameter - if optimizer is not None and isinstance(optimizer, LayerWiseDummyOptimizer): - optimizer_dict = optimizer.optimizer_dict - scheduler_dict = {} - - for param in optimizer_dict.keys(): - scheduler_dict[param] = get_scheduler( - name, - optimizer=optimizer_dict[param], - num_warmup_steps=num_warmup_steps, - num_training_steps=num_training_steps, - ) - - def scheduler_hook(param): - # Since the optimizer hook has been already attached we only need to - # attach the scheduler hook - if param.grad is not None: - scheduler_dict[param].step() - - for param in optimizer_dict.keys(): - if param.requires_grad: - param.register_post_accumulate_grad_hook(scheduler_hook) - - return LayerWiseDummyScheduler() - - if name == SchedulerType.CONSTANT: - return schedule_func(optimizer) - - if scheduler_specific_kwargs is None: - scheduler_specific_kwargs = {} - - if name == SchedulerType.REDUCE_ON_PLATEAU: - return schedule_func(optimizer, **scheduler_specific_kwargs) - - # All other schedulers require `num_warmup_steps` - if num_warmup_steps is None: - raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.") - - if name == SchedulerType.CONSTANT_WITH_WARMUP: - return schedule_func(optimizer, num_warmup_steps=num_warmup_steps) - - if name == SchedulerType.INVERSE_SQRT: - return schedule_func(optimizer, num_warmup_steps=num_warmup_steps) - - # All other schedulers require `num_training_steps` - if num_training_steps is None: - raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.") - - return schedule_func( - optimizer, - num_warmup_steps=num_warmup_steps, - num_training_steps=num_training_steps, - **scheduler_specific_kwargs, - ) - -class AdafactorSchedule(LambdaLR): - """ - Since [`~optimization.Adafactor`] performs its own scheduling, if the training loop relies on a scheduler (e.g., - for logging), this class creates a proxy object that retrieves the current lr values from the optimizer. - - It returns `initial_lr` during startup and the actual `lr` during stepping. - """ - def __init__(self, optimizer, initial_lr=0.0): - r""" - Initialize the AdafactorSchedule class. - - Args: - self (object): The instance of the AdafactorSchedule class. - optimizer (object): The optimizer to be used for updating parameters. - initial_lr (float, optional): The initial learning rate. Default is 0.0. - - Returns: - None. This method initializes the AdafactorSchedule class. - - Raises: - None. - """ - def lr_lambda(_): - return initial_lr - - for group in optimizer.param_groups: - group["initial_lr"] = initial_lr - super().__init__(optimizer, lr_lambda) - for group in optimizer.param_groups: - del group["initial_lr"] - - def get_lr(self): - r""" - This method retrieves the learning rates for the optimizer associated with the AdafactorSchedule class. - - Args: - self: AdafactorSchedule - The instance of the AdafactorSchedule class. - - Returns: - List - A list of learning rates associated with the optimizer's parameter groups. 
- - Raises: - None - """ - opt = self.optimizer - lrs = [ - opt._get_lr(group, opt.state[group["params"][0]]) - for group in opt.param_groups - if group["params"][0].grad is not None - ] - if len(lrs) == 0: - lrs = self.base_lrs # if called before stepping - return lrs - - -def get_adafactor_schedule(optimizer, initial_lr=0.0): - """ - Get a proxy schedule for [`~optimization.Adafactor`] - - Args: - optimizer ([`~torch.optim.Optimizer`]): - The optimizer for which to schedule the learning rate. - initial_lr (`float`, *optional*, defaults to 0.0): - Initial lr - - Return: - [`~optimization.Adafactor`] proxy schedule object. - - - """ - return AdafactorSchedule(optimizer, initial_lr) diff --git a/mindnlp/configs.py b/mindnlp/configs.py deleted file mode 100644 index ef7bbe59b..000000000 --- a/mindnlp/configs.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright 2022 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -Global configs -""" -import os -from packaging import version -import mindspore -from mindspore._c_expression import MSContext # pylint: disable=no-name-in-module, import-error - -SOC = MSContext.get_instance().get_ascend_soc_version() -DEVICE_TARGET = mindspore.get_context('device_target') -GENERATOR_SEED = version.parse(mindspore.__version__) >= version.parse('2.3.0') -SUPPORT_ASYNC_DIST_OP = version.parse(mindspore.__version__) >= version.parse('2.4.0') -SUPPORT_VIEW = GENERATOR_SEED -SUPPORT_BF16 = GENERATOR_SEED and '910b' in SOC -ON_ORANGE_PI = '310b' in SOC -USE_PYBOOST = version.parse(mindspore.__version__) >= version.parse('2.3.0') and DEVICE_TARGET == 'Ascend' -DEFAULT_DTYPE = mindspore.float32 - -WEIGHTS_NAME = "mindspore_model.ckpt" -PT_WEIGHTS_NAME = "pytorch_model.bin" -WEIGHTS_INDEX_NAME = "mindspore_model.ckpt.index.json" -PT_WEIGHTS_INDEX_NAME = "pytorch_model.bin.index.json" -SAFE_WEIGHTS_NAME = "model.safetensors" -SAFE_WEIGHTS_INDEX_NAME = "model.safetensors.index.json" - -CONFIG_NAME = "config.json" -GENERATION_CONFIG_NAME = "generation_config.json" -TOKENIZER_CONFIG_FILE = "tokenizer_config.json" - -FEATURE_EXTRACTOR_NAME = "preprocessor_config.json" -IMAGE_PROCESSOR_NAME = FEATURE_EXTRACTOR_NAME -PROCESSOR_NAME = "processor_config.json" -CHAT_TEMPLATE_NAME = "chat_template.json" - -ADAPTER_CONFIG_NAME = "adapter_config.json" -ADAPTER_WEIGHTS_NAME = "adapter_model.bin" -ADAPTER_SAFE_WEIGHTS_NAME = "adapter_model.safetensors" - -DEFAULT_ROOT = os.path.join(os.getcwd(), ".mindnlp") -# for modelscope models -MS_URL_BASE = "https://modelscope.cn/api/v1/models/{}/repo?Revision={}&FilePath={}" -# for huggingface url -HF_ENDPOINT = os.environ.get('HF_ENDPOINT', 'https://hf-mirror.com') -HF_URL_BASE = HF_ENDPOINT + '/{}/resolve/{}/{}?download=true' - -ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"} -MINDNLP_CACHE = os.getenv("MINDNLP_CACHE", DEFAULT_ROOT) - -REPO_TYPE_DATASET = "dataset" -REPO_TYPE_MODEL = "model" -REPO_TYPES = [None, REPO_TYPE_MODEL, 
REPO_TYPE_DATASET] - -# Token -HF_TOKEN = os.environ.get('HF_TOKEN', None) - -# Values -OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073] -OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711] -IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] -IMAGENET_DEFAULT_STD = [0.229, 0.224, 0.225] -IMAGENET_STANDARD_MEAN = [0.5, 0.5, 0.5] -IMAGENET_STANDARD_STD = [0.5, 0.5, 0.5] - -def set_pyboost(mode: bool): - """set global pyboost""" - global USE_PYBOOST - USE_PYBOOST = mode - -def use_pyboost(): - """set global pyboost""" - return USE_PYBOOST diff --git a/mindnlp/evaluate.py b/mindnlp/evaluate.py deleted file mode 100644 index 2cec5bcb6..000000000 --- a/mindnlp/evaluate.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -evaluate module. -""" -from typing import Optional, Union -from datasets import DownloadConfig, DownloadMode -from datasets.utils.version import Version -from evaluate import config -from evaluate import load as eval_load -from evaluate.module import EvaluationModule - -config.HUB_EVALUATE_URL = "https://openi.pcl.ac.cn/{path}/raw/branch/{revision}/{name}" - -def load( - path: str, - config_name: Optional[str] = None, - module_type: Optional[str] = None, - process_id: int = 0, - num_process: int = 1, - cache_dir: Optional[str] = None, - experiment_id: Optional[str] = None, - keep_in_memory: bool = False, - download_config: Optional[DownloadConfig] = None, - download_mode: Optional[DownloadMode] = None, - revision: Optional[Union[str, Version]] = None, - **init_kwargs, -) -> EvaluationModule: - r""" - Args: - path (str): The path to the file or directory to be loaded. - config_name (str, optional): The name of the configuration to be used. Default is None. - module_type (str, optional): The type of module to be loaded. Default is None. - process_id (int): The ID of the current process. Default is 0. - num_process (int): The total number of processes. Default is 1. - cache_dir (str, optional): The directory where cached files are stored. Default is None. - experiment_id (str, optional): The ID of the experiment. Default is None. - keep_in_memory (bool): Whether to keep the loaded data in memory. Default is False. - download_config (DownloadConfig, optional): The download configuration. Default is None. - download_mode (DownloadMode, optional): The download mode. Default is None. - revision (str or Version, optional): The revision or version of the loaded data. Default is None. - **init_kwargs: Additional keyword arguments to be passed to the evaluation module. - - Returns: - EvaluationModule: The loaded evaluation module. 
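Functionally, the `load` wrapper above defers to upstream `evaluate.load`; the behavioural change is the hub URL template it patches. A hedged consumer-side sketch of that override (requires the `evaluate` package and network access on first use):

from evaluate import config, load

# Redirect the hub file template so metric scripts resolved through it are
# fetched from the OpenI mirror; {path}, {revision} and {name} are filled in
# by evaluate itself.
config.HUB_EVALUATE_URL = "https://openi.pcl.ac.cn/{path}/raw/branch/{revision}/{name}"

accuracy = load("accuracy")  # downloads the metric script on first use
print(accuracy.compute(references=[0, 1, 1], predictions=[0, 1, 0]))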
- - Raises: - None - """ - return eval_load( - path, - config_name, - module_type, - process_id, - num_process, - cache_dir, - experiment_id, - keep_in_memory, - download_config, - download_mode, - revision, - **init_kwargs, - ) \ No newline at end of file diff --git a/mindnlp/integrations/__init__.py b/mindnlp/integrations/__init__.py new file mode 100644 index 000000000..20b4fab81 --- /dev/null +++ b/mindnlp/integrations/__init__.py @@ -0,0 +1,3 @@ +from . import safetensors +from . import transformers +from . import evaluate diff --git a/mindnlp/sentence/__init__.py b/mindnlp/integrations/evaluate.py similarity index 82% rename from mindnlp/sentence/__init__.py rename to mindnlp/integrations/evaluate.py index caf1f7ed7..a5c632d38 100644 --- a/mindnlp/sentence/__init__.py +++ b/mindnlp/integrations/evaluate.py @@ -12,5 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -"""MindNLP Text2vec""" -from .sentence_transformer import SentenceTransformer +""" +evaluate module. +""" +from evaluate import config + +config.HUB_EVALUATE_URL = "https://openi.pcl.ac.cn/{path}/raw/branch/{revision}/{name}" diff --git a/mindnlp/safetensors.py b/mindnlp/integrations/safetensors.py similarity index 94% rename from mindnlp/safetensors.py rename to mindnlp/integrations/safetensors.py index 6ef4c88df..86f2f0794 100644 --- a/mindnlp/safetensors.py +++ b/mindnlp/integrations/safetensors.py @@ -7,7 +7,7 @@ import mindtorch -from .configs import SUPPORT_BF16 +from mindtorch.configs import SUPPORT_BF16 if SUPPORT_BF16: from mindspore.common.np_dtype import bfloat16 # pylint: disable=import-error @@ -103,6 +103,9 @@ def start_offset(self): def get_shape(self): return self.shape + def get_dtype(self): + return self.info["dtype"] + @property def shape(self): return self.info["shape"] @@ -123,6 +126,11 @@ def bits(self): def nbytes(self): return self.nelements * self.bits + def __getitem__(self, slice): + if slice is Ellipsis: + return self.get() + return self.get()[slice] + def getSize(fileobject): fileobject.seek(0, 2) # move the cursor to the end of the file size = fileobject.tell() @@ -195,6 +203,8 @@ def keys(self): def get_tensor(self, name): return self.tensors[name].get() + def get_slice(self, name): + return self.tensors[name] def safe_load_file(filename): """ @@ -210,7 +220,6 @@ def safe_load_file(filename): FileNotFoundError: If the specified file 'filename' does not exist. ValueError: If the data in the file is not in the correct format to create MindSpore Parameters. 
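The `get_slice`/`__getitem__` additions above give the patched loader the same lazy-slicing surface as `safetensors.safe_open`. A consumer-side sketch against the upstream safetensors API (assuming `framework="np"` is accepted by `safe_open`, mirroring what `fast_safe_open` above expects; the file name is illustrative):

import numpy as np
from safetensors.numpy import save_file
from safetensors import safe_open

# Write a toy checkpoint, then read only part of one tensor back.
save_file({"embed.weight": np.arange(12, dtype=np.float32).reshape(3, 4)}, "toy.safetensors")

with safe_open("toy.safetensors", framework="np") as f:
    view = f.get_slice("embed.weight")    # lazy handle, nothing materialized yet
    first_rows = view[:2]                 # shape (2, 4), like the wrapper's __getitem__
    full = f.get_tensor("embed.weight")   # eager load of the whole tensor
print(first_rows.shape, full.shape)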
""" - print('use patched safetensors loader') result = {} with fast_safe_open(filename, framework="np") as f: for k in f.keys(): @@ -238,5 +247,6 @@ def safe_save_file(tensor_dict, filename, metadata=None): tensor_dict = {k: v.asnumpy() for k, v in tensor_dict.items()} return safetensors.numpy.save_file(tensor_dict, filename, metadata) +safetensors.safe_open = fast_safe_open from safetensors import torch -torch.load_file = safe_load_file \ No newline at end of file +torch.load_file = safe_load_file diff --git a/mindnlp/transformers.py b/mindnlp/integrations/transformers.py similarity index 91% rename from mindnlp/transformers.py rename to mindnlp/integrations/transformers.py index c344b1963..d09b85aeb 100644 --- a/mindnlp/transformers.py +++ b/mindnlp/integrations/transformers.py @@ -1,8 +1,8 @@ import transformers + def mock_is_not_available(): return False transformers.utils.import_utils.is_torchvision_v2_available.__code__ = mock_is_not_available.__code__ transformers.utils.import_utils.is_torch_flex_attn_available.__code__ = mock_is_not_available.__code__ -from transformers import * diff --git a/mindnlp/mimm/__init__.py b/mindnlp/mimm/__init__.py deleted file mode 100644 index d5ab11c80..000000000 --- a/mindnlp/mimm/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""mindspore image models""" diff --git a/mindnlp/mimm/layers/__init__.py b/mindnlp/mimm/layers/__init__.py deleted file mode 100644 index cc255d921..000000000 --- a/mindnlp/mimm/layers/__init__.py +++ /dev/null @@ -1,56 +0,0 @@ -"""mimm layers""" -from .adaptive_avgmax_pool import \ - adaptive_avgmax_pool2d, select_adaptive_pool2d, AdaptiveAvgMaxPool2d, SelectAdaptivePool2d -from .attention2d import MultiQueryAttention2d, Attention2d, MultiQueryAttentionV2 -from .attention_pool import AttentionPoolLatent -from .attention_pool2d import AttentionPool2d, RotAttentionPool2d -from .blur_pool import BlurPool2d, create_aa -from .classifier import create_classifier, ClassifierHead, NormMlpClassifierHead, ClNormMlpClassifierHead -from .cond_conv2d import CondConv2d, get_condconv_initializer -from .conv2d_same import Conv2dSame, conv2d_same -from .conv_bn_act import ConvNormAct, ConvNormActAa, ConvBnAct -from .create_act import create_act_layer, get_act_layer, get_act_fn -from .create_attn import get_attn, create_attn -from .create_conv2d import create_conv2d -from .create_norm import get_norm_layer, create_norm_layer -from .create_norm_act import get_norm_act_layer, create_norm_act_layer -from .drop import DropBlock2d, DropPath, drop_block_2d, drop_path -from .eca import EcaModule, CecaModule, EfficientChannelAttn, CircularEfficientChannelAttn -from .evo_norm import EvoNorm2dB0, EvoNorm2dB1, EvoNorm2dB2,\ - EvoNorm2dS0, EvoNorm2dS0a, EvoNorm2dS1, EvoNorm2dS1a, EvoNorm2dS2, EvoNorm2dS2a -from .filter_response_norm import FilterResponseNormTlu2d, FilterResponseNormAct2d -from .format import Format, get_channel_dim, get_spatial_dim, nchw_to, nhwc_to -from .gather_excite import GatherExcite -from .global_context import GlobalContext -from .grid import ndgrid, meshgrid -from .helpers import to_ntuple, to_2tuple, to_3tuple, to_4tuple, make_divisible, extend_tuple -from .hybrid_embed import HybridEmbed, HybridEmbedWithSize -# from .inplace_abn import InplaceAbn -from .layer_scale import LayerScale, LayerScale2d -from .mixed_conv2d import MixedConv2d -from .mlp import Mlp, GluMlp, GatedMlp, SwiGLU, SwiGLUPacked, ConvMlp, GlobalResponseNormMlp -from .non_local_attn import NonLocalAttn, BatNonLocalAttn -from .norm import GroupNorm, GroupNorm1, 
LayerNorm, LayerNorm2d, RmsNorm, RmsNorm2d -from .norm_act import BatchNormAct2d, GroupNormAct, GroupNorm1Act, LayerNormAct, LayerNormAct2d,\ - SyncBatchNormAct, convert_sync_batchnorm, FrozenBatchNormAct2d, freeze_batch_norm_2d, unfreeze_batch_norm_2d -from .padding import get_padding, get_same_padding, pad_same -from .patch_dropout import PatchDropout -from .patch_embed import PatchEmbed, PatchEmbedWithSize, resample_patch_embed -from .pool2d_same import AvgPool2dSame, create_pool2d -from .pos_embed import resample_abs_pos_embed, resample_abs_pos_embed_nhwc -from .pos_embed_rel import RelPosMlp, RelPosBias, RelPosBiasTf, gen_relative_position_index, gen_relative_log_coords, \ - resize_rel_pos_bias_table, resize_rel_pos_bias_table_simple, resize_rel_pos_bias_table_levit -from .pos_embed_sincos import pixel_freq_bands, freq_bands, build_sincos2d_pos_embed, build_fourier_pos_embed, \ - build_rotary_pos_embed, apply_rot_embed, apply_rot_embed_cat, apply_rot_embed_list, apply_keep_indices_nlc, \ - FourierEmbed, RotaryEmbedding, RotaryEmbeddingCat -from .squeeze_excite import SEModule, SqueezeExcite, EffectiveSEModule, EffectiveSqueezeExcite -from .selective_kernel import SelectiveKernel -from .separable_conv import SeparableConv2d, SeparableConvNormAct -from .space_to_depth import SpaceToDepth, DepthToSpace -from .split_attn import SplitAttn -from .split_batchnorm import SplitBatchNorm2d, convert_splitbn_model -from .std_conv import StdConv2d, StdConv2dSame, ScaledStdConv2d, ScaledStdConv2dSame -from .test_time_pool import TestTimePoolHead, apply_test_time_pool -from .typing import LayerType, PadType -from .weight_init import trunc_normal_, trunc_normal_tf_, variance_scaling_, lecun_normal_, \ - init_weight_jax, init_weight_vit diff --git a/mindnlp/mimm/layers/adaptive_avgmax_pool.py b/mindnlp/mimm/layers/adaptive_avgmax_pool.py deleted file mode 100644 index eb86155d8..000000000 --- a/mindnlp/mimm/layers/adaptive_avgmax_pool.py +++ /dev/null @@ -1,181 +0,0 @@ -""" PyTorch selectable adaptive pooling -Adaptive pooling with the ability to select the type of pooling from: - * 'avg' - Average pooling - * 'max' - Max pooling - * 'avgmax' - Sum of average and max pooling re-scaled by 0.5 - * 'avgmaxc' - Concatenation of average and max pooling along feature dim, doubles feature dim - -Both a functional and a nn.Module version of the pooling is provided. 
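For the 'avgmax' and 'avgmaxc' modes described above, a minimal functional sketch (torch stands in for the `mindnlp.core` functional API):

import torch
import torch.nn.functional as F

def adaptive_avgmax_pool2d(x, output_size=1):
    # 'avgmax': average of adaptive average- and max-pooling.
    x_avg = F.adaptive_avg_pool2d(x, output_size)
    x_max = F.adaptive_max_pool2d(x, output_size)
    return 0.5 * (x_avg + x_max)

def adaptive_catavgmax_pool2d(x, output_size=1):
    # 'avgmaxc'/'catavgmax': concatenate along channels, doubling the feature dim.
    return torch.cat([F.adaptive_avg_pool2d(x, output_size),
                      F.adaptive_max_pool2d(x, output_size)], dim=1)

x = torch.randn(2, 64, 7, 7)
print(adaptive_avgmax_pool2d(x).shape)     # torch.Size([2, 64, 1, 1])
print(adaptive_catavgmax_pool2d(x).shape)  # torch.Size([2, 128, 1, 1])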
- -Hacked together by / Copyright 2020 Ross Wightman -""" -from typing import Tuple, Union - -from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F - -from .format import get_spatial_dim, get_channel_dim - -_int_tuple_2_t = Union[int, Tuple[int, int]] - - -def adaptive_pool_feat_mult(pool_type='avg'): - if pool_type.endswith('catavgmax'): - return 2 - else: - return 1 - - -def adaptive_avgmax_pool2d(x, output_size: _int_tuple_2_t = 1): - x_avg = F.adaptive_avg_pool2d(x, output_size) - x_max = F.adaptive_max_pool2d(x, output_size) - return 0.5 * (x_avg + x_max) - - -def adaptive_catavgmax_pool2d(x, output_size: _int_tuple_2_t = 1): - x_avg = F.adaptive_avg_pool2d(x, output_size) - x_max = F.adaptive_max_pool2d(x, output_size) - return ops.cat((x_avg, x_max), 1) - - -def select_adaptive_pool2d(x, pool_type='avg', output_size: _int_tuple_2_t = 1): - """Selectable global pooling function with dynamic input kernel size - """ - if pool_type == 'avg': - x = F.adaptive_avg_pool2d(x, output_size) - elif pool_type == 'avgmax': - x = adaptive_avgmax_pool2d(x, output_size) - elif pool_type == 'catavgmax': - x = adaptive_catavgmax_pool2d(x, output_size) - elif pool_type == 'max': - x = F.adaptive_max_pool2d(x, output_size) - else: - assert False, 'Invalid pool type: %s' % pool_type - return x - - -class FastAdaptiveAvgPool(nn.Module): - def __init__(self, flatten: bool = False, input_fmt: F = 'NCHW'): - super(FastAdaptiveAvgPool, self).__init__() - self.flatten = flatten - self.dim = get_spatial_dim(input_fmt) - - def forward(self, x): - return x.mean(self.dim, keepdim=not self.flatten) - - -class FastAdaptiveMaxPool(nn.Module): - def __init__(self, flatten: bool = False, input_fmt: str = 'NCHW'): - super(FastAdaptiveMaxPool, self).__init__() - self.flatten = flatten - self.dim = get_spatial_dim(input_fmt) - - def forward(self, x): - return x.amax(self.dim, keepdim=not self.flatten) - - -class FastAdaptiveAvgMaxPool(nn.Module): - def __init__(self, flatten: bool = False, input_fmt: str = 'NCHW'): - super(FastAdaptiveAvgMaxPool, self).__init__() - self.flatten = flatten - self.dim = get_spatial_dim(input_fmt) - - def forward(self, x): - x_avg = x.mean(self.dim, keepdim=not self.flatten) - x_max = x.amax(self.dim, keepdim=not self.flatten) - return 0.5 * x_avg + 0.5 * x_max - - -class FastAdaptiveCatAvgMaxPool(nn.Module): - def __init__(self, flatten: bool = False, input_fmt: str = 'NCHW'): - super(FastAdaptiveCatAvgMaxPool, self).__init__() - self.flatten = flatten - self.dim_reduce = get_spatial_dim(input_fmt) - if flatten: - self.dim_cat = 1 - else: - self.dim_cat = get_channel_dim(input_fmt) - - def forward(self, x): - x_avg = x.mean(self.dim_reduce, keepdim=not self.flatten) - x_max = x.amax(self.dim_reduce, keepdim=not self.flatten) - return ops.cat((x_avg, x_max), self.dim_cat) - - -class AdaptiveAvgMaxPool2d(nn.Module): - def __init__(self, output_size: _int_tuple_2_t = 1): - super(AdaptiveAvgMaxPool2d, self).__init__() - self.output_size = output_size - - def forward(self, x): - return adaptive_avgmax_pool2d(x, self.output_size) - - -class AdaptiveCatAvgMaxPool2d(nn.Module): - def __init__(self, output_size: _int_tuple_2_t = 1): - super(AdaptiveCatAvgMaxPool2d, self).__init__() - self.output_size = output_size - - def forward(self, x): - return adaptive_catavgmax_pool2d(x, self.output_size) - - -class SelectAdaptivePool2d(nn.Module): - """Selectable global pooling layer with dynamic input kernel size - """ - def __init__( - self, - output_size: _int_tuple_2_t = 1, - 
pool_type: str = 'fast', - flatten: bool = False, - input_fmt: str = 'NCHW', - ): - super(SelectAdaptivePool2d, self).__init__() - assert input_fmt in ('NCHW', 'NHWC') - self.pool_type = pool_type or '' # convert other falsy values to empty string for consistent TS typing - pool_type = pool_type.lower() - if not pool_type: - self.pool = nn.Identity() # pass through - self.flatten = nn.Flatten(1) if flatten else nn.Identity() - elif pool_type.startswith('fast') or input_fmt != 'NCHW': - assert output_size == 1, 'Fast pooling and non NCHW input formats require output_size == 1.' - if pool_type.endswith('catavgmax'): - self.pool = FastAdaptiveCatAvgMaxPool(flatten, input_fmt=input_fmt) - elif pool_type.endswith('avgmax'): - self.pool = FastAdaptiveAvgMaxPool(flatten, input_fmt=input_fmt) - elif pool_type.endswith('max'): - self.pool = FastAdaptiveMaxPool(flatten, input_fmt=input_fmt) - elif pool_type == 'fast' or pool_type.endswith('avg'): - self.pool = FastAdaptiveAvgPool(flatten, input_fmt=input_fmt) - else: - assert False, 'Invalid pool type: %s' % pool_type - self.flatten = nn.Identity() - else: - assert input_fmt == 'NCHW' - if pool_type == 'avgmax': - self.pool = AdaptiveAvgMaxPool2d(output_size) - elif pool_type == 'catavgmax': - self.pool = AdaptiveCatAvgMaxPool2d(output_size) - elif pool_type == 'max': - self.pool = nn.AdaptiveMaxPool2d(output_size) - elif pool_type == 'avg': - self.pool = nn.AdaptiveAvgPool2d(output_size) - else: - assert False, 'Invalid pool type: %s' % pool_type - self.flatten = nn.Flatten(1) if flatten else nn.Identity() - - def is_identity(self): - return not self.pool_type - - def forward(self, x): - x = self.pool(x) - x = self.flatten(x) - return x - - def feat_mult(self): - return adaptive_pool_feat_mult(self.pool_type) - - def __repr__(self): - return self.__class__.__name__ + '(' \ - + 'pool_type=' + self.pool_type \ - + ', flatten=' + str(self.flatten) + ')' diff --git a/mindnlp/mimm/layers/attention2d.py b/mindnlp/mimm/layers/attention2d.py deleted file mode 100644 index eb76d111e..000000000 --- a/mindnlp/mimm/layers/attention2d.py +++ /dev/null @@ -1,329 +0,0 @@ -"""attention 2d""" -from typing import List, Optional, Union - -import mindspore -from mindnlp.core import nn, ops - -from .create_conv2d import create_conv2d -from .helpers import to_2tuple -from .pool2d_same import create_pool2d - - -class MultiQueryAttentionV2(nn.Module): - """Multi Query Attention. - - Fast Transformer Decoding: One Write-Head is All You Need - https://arxiv.org/pdf/1911.02150.pdf - - This is an acceletor optimized version - removing multiple unneccessary - tensor transpose by re-arranging indices according to the following rules: 1) - contracted indices are at the end, 2) other indices have the same order in the - input and output tensores. - - Compared to V1, this gives 3x speed up. 
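Multi-query attention keeps one shared key/value projection for all query heads, which is what the einsum contractions in `MultiQueryAttentionV2` implement. A compact stand-alone sketch of that core contraction (torch as a stand-in; shapes chosen for clarity rather than speed):

import torch

B, N, D, H, K = 2, 16, 64, 8, 32          # batch, tokens, model dim, heads, head dim
x = torch.randn(B, N, D)

w_q = torch.randn(H, K, D) * D ** -0.5    # one projection per query head
w_k = torch.randn(D, K) * D ** -0.5       # single shared key projection
w_v = torch.randn(D, K) * D ** -0.5       # single shared value projection

q = torch.einsum("bnd,hkd->bnhk", x, w_q)   # per-head queries
k = torch.einsum("bmd,dk->bmk", x, w_k)     # shared keys
v = torch.einsum("bmd,dk->bmk", x, w_v)     # shared values

attn = torch.einsum("bnhk,bmk->bnhm", q * K ** -0.5, k).softmax(dim=-1)
out = torch.einsum("bnhm,bmk->bnhk", attn, v)   # (B, N, H, K)
print(out.shape)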
- """ - - def __init__( - self, - dim: int, - dim_out: Optional[int] = None, - num_heads: int = 8, - key_dim: int = 64, - value_dim: int = 64, - attn_drop: float = 0., - proj_drop: float = 0., - ): - """Initializer.""" - super().__init__() - dim_out = dim_out or dim - self.num_heads = num_heads - self.key_dim = key_dim - self.value_dim = value_dim - self.scale = key_dim ** -0.5 - - self.query_proj = nn.Parameter(ops.randn([self.num_heads, self.key_dim, dim])) - self.key_proj = nn.Parameter(ops.randn([dim, self.key_dim])) - self.value_proj = nn.Parameter(ops.randn([dim, self.value_dim])) - self.attn_drop = nn.Dropout(attn_drop) - self.out_proj = nn.Parameter(ops.randn([dim_out, self.num_heads, self.value_dim])) - self.proj_drop = nn.Dropout(proj_drop) - - def _reshape_input(self, t): - """Reshapes a tensor to three dimensions, keeping the first and last.""" - s = t.shape - # Propagate the shape statically where possible. - #num = t.shape[1:-1].numel() - #return t.reshape(s[0], num, s[-1]) - return t.reshape(s[0], s[1], -1).transpose(1, 2) - - def forward(self, x, m: Optional[mindspore.Tensor] = None): - """Run layer computation.""" - s = x.shape - m = m or x - - reshaped_x = self._reshape_input(x) - reshaped_m = self._reshape_input(m) - - q = ops.einsum('bnd,hkd->bnhk', reshaped_x, self.query_proj) - k = ops.einsum('bmd,dk->bmk', reshaped_m, self.key_proj) - - attn = ops.einsum('bnhk,bmk->bnhm', q, k) - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - v = ops.einsum('bmd,dv->bmv', reshaped_m, self.value_proj) - o = ops.einsum('bnhm,bmv->bnhv', attn, v) - result = ops.einsum('bnhv,dhv->bnd', o, self.out_proj) - result = self.proj_drop(result) - return result.reshape(s) - - -class MultiQueryAttention2d(nn.Module): - """Multi Query Attention with spatial downsampling. - - 3 parameters are introduced for the spatial downsampling: - 1. kv_stride: downsampling factor on Key and Values only. - 2. query_strides: horizontal & vertical strides on Query only. - - This is an optimized version. - 1. Projections in Attention is explict written out as 1x1 Conv2D. - 2. Additional reshapes are introduced to bring a up to 3x speed up. - """ - - def __init__( - self, - dim: int, - dim_out: Optional[int] = None, - num_heads: int = 8, - key_dim: Optional[int] = None, - value_dim: Optional[int] = None, - query_strides: int = 1, - kv_stride: int = 1, - dw_kernel_size: int = 3, - dilation: int = 1, - padding: Union[str, int, List[int]] = '', - attn_drop: float = 0., - proj_drop: float = 0., - norm_layer: nn.Module = nn.BatchNorm2d, - use_bias: bool = False, - ): - """Initializer. - - Args: - num_heads: Number of attention heads. - key_dim: Size of the attention key dimension. - value_dim: Size of the attention value dimension. - query_strides: Vertical stride size for query only. - kv_stride: Key and value stride size. - dw_kernel_size: Spatial dimension of the depthwise kernel. 
- """ - super().__init__() - dim_out = dim_out or dim - self.num_heads = num_heads - self.key_dim = key_dim or dim // num_heads - self.value_dim = value_dim or dim // num_heads - self.query_strides = to_2tuple(query_strides) - self.kv_stride = kv_stride - self.has_query_strides = any([s > 1 for s in self.query_strides]) - self.scale = self.key_dim ** -0.5 - self.drop = attn_drop - - self.query = nn.Sequential() - if self.has_query_strides: - if padding == 'same': - self.query.add_module('down_pool', create_pool2d( - 'avg', - kernel_size=self.query_strides, - padding='same', - )) - else: - # no pad if not 'same' as kern=stride=even - self.query.add_module('down_pool', nn.AvgPool2d(kernel_size=query_strides)) - self.query.add_module('norm', norm_layer(dim)) - self.query.add_module('proj', create_conv2d( - dim, - self.num_heads * self.key_dim, - kernel_size=1, - bias=use_bias, - )) - - self.key = nn.Sequential() - if kv_stride > 1: - self.key.add_module('down_conv', create_conv2d( - dim, - dim, - kernel_size=dw_kernel_size, - stride=kv_stride, - dilation=dilation, - padding=padding, - depthwise=True, - )) - self.key.add_module('norm', norm_layer(dim)) - self.key.add_module('proj', create_conv2d( - dim, - self.key_dim, - kernel_size=1, - padding=padding, - bias=use_bias, - )) - - self.value = nn.Sequential() - if kv_stride > 1: - self.value.add_module('down_conv', create_conv2d( - dim, - dim, - kernel_size=dw_kernel_size, - stride=kv_stride, - dilation=dilation, - padding=padding, - depthwise=True, - )) - self.value.add_module('norm', norm_layer(dim)) - self.value.add_module('proj', create_conv2d( - dim, - self.value_dim, - kernel_size=1, - bias=use_bias, - )) - - self.attn_drop = nn.Dropout(attn_drop) - - self.output = nn.Sequential() - if self.has_query_strides: - self.output.add_module('upsample', nn.Upsample(scale_factor=self.query_strides, mode='bilinear', align_corners=False)) - self.output.add_module('proj', create_conv2d( - self.value_dim * self.num_heads, - dim_out, - kernel_size=1, - bias=use_bias, - )) - self.output.add_module('drop', nn.Dropout(proj_drop)) - - self.einsum = False - - def init_weights(self): - # using xavier appeared to improve stability for mobilenetv4 hybrid w/ this layer - nn.init.xavier_uniform_(self.query.proj.weight) - nn.init.xavier_uniform_(self.key.proj.weight) - nn.init.xavier_uniform_(self.value.proj.weight) - if self.kv_stride > 1: - nn.init.xavier_uniform_(self.key.down_conv.weight) - nn.init.xavier_uniform_(self.value.down_conv.weight) - nn.init.xavier_uniform_(self.output.proj.weight) - - def _reshape_input(self, t: mindspore.Tensor): - """Reshapes a tensor to three dimensions, keeping the batch and channels.""" - s = t.shape - t = t.reshape(s[0], s[1], -1).transpose(1, 2) - if self.einsum: - return t - else: - return t.unsqueeze(1).contiguous() - - def _reshape_projected_query(self, t: mindspore.Tensor, num_heads: int, key_dim: int): - """Reshapes projected query: [b, n, n, h x k] -> [b, n x n, h, k].""" - s = t.shape - t = t.reshape(s[0], num_heads, key_dim, -1) - if self.einsum: - return t.permute(0, 3, 1, 2).contiguous() - else: - return t.transpose(-1, -2).contiguous() - - def _reshape_output(self, t: mindspore.Tensor, num_heads: int, h_px: int, w_px: int): - """Reshape output:[b, n x n x h, k] -> [b, n, n, hk].""" - s = t.shape - feat_dim = s[-1] * num_heads - if not self.einsum: - t = t.transpose(1, 2) - return t.reshape(s[0], h_px, w_px, feat_dim).permute(0, 3, 1, 2).contiguous() - - def forward(self, x, attn_mask: Optional[mindspore.Tensor] = 
None): - """Run layer computation.""" - B, C, H, W = s = x.shape - - q = self.query(x) - # desired q shape: [b, h, k, n x n] - [b, l, h, k] - q = self._reshape_projected_query(q, self.num_heads, self.key_dim) - - k = self.key(x) - # output shape of k: [b, k, p], p = m x m - k = self._reshape_input(k) - - v = self.value(x) - # output shape of v: [ b, p, k], p = m x m - v = self._reshape_input(v) - - # desired q shape: [b, n x n, h, k] - # desired k shape: [b, m x m, k] - # desired logits shape: [b, n x n, h, m x m] - if self.einsum: - attn = ops.einsum('blhk,bpk->blhp', q, k) * self.scale - if attn_mask is not None: - # NOTE: assumes mask is float and in correct shape - attn = attn + attn_mask - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - o = ops.einsum('blhp,bpk->blhk', attn, v) - else: - q = q * self.scale - attn = q @ k.transpose(-1, -2) - if attn_mask is not None: - # NOTE: assumes mask is float and in correct shape - attn = attn + attn_mask - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - o = attn @ v - - # reshape o into [b, hk, n, n,] - o = self._reshape_output(o, self.num_heads, H // self.query_strides[0], W // self.query_strides[1]) - x = self.output(o) - return x - - -class Attention2d(nn.Module): - - """ multi-head attention for 2D NCHW tensors""" - def __init__( - self, - dim: int, - dim_out: Optional[int] = None, - num_heads: int = 32, - bias: bool = True, - expand_first: bool = False, - head_first: bool = False, - attn_drop: float = 0., - proj_drop: float = 0. - ): - super().__init__() - dim_out = dim_out or dim - dim_attn = dim_out if expand_first else dim - self.num_heads = num_heads - self.dim_head = dim_attn // num_heads - self.head_first = head_first - self.scale = num_heads ** -0.5 - - self.qkv = nn.Conv2d(dim, dim_attn * 3, 1, bias=bias) - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Conv2d(dim_attn, dim_out, 1, bias=bias) - self.proj_drop = nn.Dropout(proj_drop) - - def forward(self, x, attn_mask: Optional[mindspore.Tensor] = None): - B, C, H, W = x.shape - - if self.head_first: - q, k, v = self.qkv(x).view(B, self.num_heads, self.dim_head * 3, -1).chunk(3, dim=2) - else: - q, k, v = self.qkv(x).reshape(B, 3, self.num_heads, self.dim_head, -1).unbind(1) - - q = q * self.scale - attn = q.transpose(-2, -1) @ k - if attn_mask is not None: - # NOTE: assumes mask is float and in correct shape - attn = attn + attn_mask - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - x = (v @ attn.transpose(-2, -1)).view(B, -1, H, W) - - x = self.proj(x) - x = self.proj_drop(x) - return x diff --git a/mindnlp/mimm/layers/attention_pool.py b/mindnlp/mimm/layers/attention_pool.py deleted file mode 100644 index 5cf9f5b17..000000000 --- a/mindnlp/mimm/layers/attention_pool.py +++ /dev/null @@ -1,97 +0,0 @@ -"""attention pool""" -from typing import Optional - -from mindnlp.core import nn, ops - -from .mlp import Mlp -from .weight_init import trunc_normal_tf_ - - -class AttentionPoolLatent(nn.Module): - """ Attention pooling w/ latent query - """ - - def __init__( - self, - in_features: int, - out_features: int = None, - embed_dim: int = None, - num_heads: int = 8, - feat_size: Optional[int] = None, - mlp_ratio: float = 4.0, - qkv_bias: bool = True, - qk_norm: bool = False, - latent_len: int = 1, - latent_dim: int = None, - pos_embed: str = '', - pool_type: str = 'token', - norm_layer: Optional[nn.Module] = None, - drop: float = 0.0, - ): - super().__init__() - embed_dim = embed_dim or in_features - out_features = out_features or in_features - 
assert embed_dim % num_heads == 0 - self.num_heads = num_heads - self.head_dim = embed_dim // num_heads - self.feat_size = feat_size - self.scale = self.head_dim ** -0.5 - self.pool = pool_type - - if pos_embed == 'abs': - assert feat_size is not None - self.pos_embed = nn.Parameter(ops.zeros(feat_size, in_features)) - else: - self.pos_embed = None - - self.latent_dim = latent_dim or embed_dim - self.latent_len = latent_len - self.latent = nn.Parameter(ops.zeros(1, self.latent_len, embed_dim)) - - self.q = nn.Linear(embed_dim, embed_dim, bias=qkv_bias) - self.kv = nn.Linear(embed_dim, embed_dim * 2, bias=qkv_bias) - self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() - self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() - self.proj = nn.Linear(embed_dim, embed_dim) - self.proj_drop = nn.Dropout(drop) - - self.norm = norm_layer(out_features) if norm_layer is not None else nn.Identity() - self.mlp = Mlp(embed_dim, int(embed_dim * mlp_ratio)) - - self.init_weights() - - def init_weights(self): - if self.pos_embed is not None: - trunc_normal_tf_(self.pos_embed, std=self.pos_embed.shape[1] ** -0.5) - trunc_normal_tf_(self.latent, std=self.latent_dim ** -0.5) - - def forward(self, x): - B, N, C = x.shape - - if self.pos_embed is not None: - x = x + self.pos_embed.unsqueeze(0).to(x.dtype) - - q_latent = self.latent.expand(B, -1, -1) - q = self.q(q_latent).reshape(B, self.latent_len, self.num_heads, self.head_dim).transpose(1, 2) - - kv = self.kv(x).reshape(B, N, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) - k, v = kv.unbind(0) - - q, k = self.q_norm(q), self.k_norm(k) - - q = q * self.scale - attn = q @ k.transpose(-2, -1) - attn = attn.softmax(dim=-1) - x = attn @ v - x = x.transpose(1, 2).reshape(B, self.latent_len, C) - x = self.proj(x) - x = self.proj_drop(x) - - x = x + self.mlp(self.norm(x)) - - # optional pool if latent seq_len > 1 and pooled output is desired - if self.pool == 'token': - x = x[:, 0] - elif self.pool == 'avg': - x = x.mean(1) - return x diff --git a/mindnlp/mimm/layers/attention_pool2d.py b/mindnlp/mimm/layers/attention_pool2d.py deleted file mode 100644 index e67da85d4..000000000 --- a/mindnlp/mimm/layers/attention_pool2d.py +++ /dev/null @@ -1,268 +0,0 @@ -""" Attention Pool 2D - -Implementations of 2D spatial feature pooling using multi-head attention instead of average pool. - -Based on idea in CLIP by OpenAI, licensed Apache 2.0 -https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py - -Hacked together by / Copyright 2021 Ross Wightman -""" -from typing import Optional, Union, Tuple - -import mindspore -from mindnlp.core import nn, ops - -from .helpers import to_2tuple -from .pos_embed import resample_abs_pos_embed -from .pos_embed_sincos import apply_rot_embed, RotaryEmbedding -from .weight_init import trunc_normal_ - - -class RotAttentionPool2d(nn.Module): - """ Attention based 2D feature pooling w/ rotary (relative) pos embedding. - This is a multi-head attention based replacement for (spatial) average pooling in NN architectures. - - Adapted from the AttentionPool2d in CLIP w/ rotary embedding instead of learned embed. - https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py - - NOTE: While this impl does not require a fixed feature size, performance at differeing resolutions from - train varies widely and falls off dramatically. I'm not sure if there is a way around this... 
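Both pooling heads in these files replace global average pooling with one cross-attention step from a learned query. A stripped-down sketch of that idea, without the rotary or absolute position embeddings (torch as a stand-in; `TinyAttnPool` is an illustrative name):

import torch
import torch.nn as nn

class TinyAttnPool(nn.Module):
    """Pool (B, N, C) token features into (B, C) with one learned latent query."""
    def __init__(self, dim, num_heads=8):
        super().__init__()
        self.latent = nn.Parameter(torch.zeros(1, 1, dim))
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)

    def forward(self, x):
        q = self.latent.expand(x.shape[0], -1, -1)   # one query per sample
        pooled, _ = self.attn(q, x, x)               # cross-attend over all tokens
        return pooled[:, 0]                          # (B, C)

pool = TinyAttnPool(dim=64)
print(pool(torch.randn(2, 49, 64)).shape)            # torch.Size([2, 64])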
-RW - """ - - def __init__( - self, - in_features: int, - out_features: Optional[int] = None, - ref_feat_size: Union[int, Tuple[int, int]] = 7, - embed_dim: Optional[int] = None, - head_dim: Optional[int] = 64, - num_heads: Optional[int] = None, - qkv_bias: bool = True, - qkv_separate: bool = False, - pool_type: str = 'token', - class_token: bool = False, - drop_rate: float = 0., - ): - super().__init__() - assert pool_type in ('', 'token') - self.embed_dim = embed_dim = embed_dim or in_features - self.in_features = in_features - self.out_features = out_features or in_features - ref_feat_size = to_2tuple(ref_feat_size) - if num_heads is not None: - assert embed_dim % num_heads == 0 - head_dim = embed_dim // num_heads - else: - assert embed_dim % head_dim == 0 - num_heads = embed_dim // head_dim - self.num_heads = num_heads - self.head_dim = head_dim - self.pool_type = pool_type.lower() - self.scale = self.head_dim ** -0.5 - - if class_token: - self.cls_token = nn.Parameter(ops.zeros(1, embed_dim)) - else: - self.cls_token = None - - if qkv_separate: - self.q = nn.Linear(in_features, embed_dim, bias=qkv_bias) - self.k = nn.Linear(in_features, embed_dim, bias=qkv_bias) - self.v = nn.Linear(in_features, embed_dim, bias=qkv_bias) - self.qkv = None - else: - self.qkv = nn.Linear(in_features, embed_dim * 3, bias=qkv_bias) - self.drop = nn.Dropout(drop_rate) - self.proj = nn.Linear(embed_dim, self.out_features) - self.pos_embed = RotaryEmbedding(self.head_dim, in_pixels=False, ref_feat_shape=ref_feat_size) - - def init_weights(self, zero_init_last: bool = False): - if self.qkv is None: - in_features = self.q.in_features - trunc_normal_(self.q.weight, std=in_features ** -0.5) - nn.init.zeros_(self.q.bias) - trunc_normal_(self.k.weight, std=in_features ** -0.5) - nn.init.zeros_(self.k.bias) - trunc_normal_(self.v.weight, std=in_features ** -0.5) - nn.init.zeros_(self.v.bias) - else: - in_features = self.qkv.in_features - trunc_normal_(self.qkv.weight, std=in_features ** -0.5) - nn.init.zeros_(self.qkv.bias) - - def reset(self, num_classes: Optional[int] = None, pool_type: Optional[str] = None): - # NOTE: this module is being used as a head, so need compatible reset() - if pool_type is not None: - assert pool_type in ('', 'token') - self.pool_type = pool_type - if num_classes is not None: - self.proj = nn.Linear(self.in_features, num_classes) if num_classes > 0 else nn.Identity() - self.out_features = num_classes if num_classes > 0 else self.embed_dim - - def _pool(self, x: mindspore.Tensor, H: int, W: int) -> mindspore.Tensor: - if self.pool_type == 'token': - x = x[:, 0] - else: - # if not pooled, return spatial output without token - x = x[:, 1:].reshape(x.shape[0], H, W, -1).permute(0, 3, 1, 2) - return x - - def forward(self, x, pre_logits: bool = False): - B, _, H, W = x.shape - N = H * W - x = x.flatten(2).transpose(1, 2) - if self.cls_token is None: - x = ops.cat([x.mean(1, keepdim=True), x], dim=1) - else: - x = ops.cat([self.cls_token.expand(x.shape[0], -1, -1), x], dim=1) - if self.qkv is None: - q = self.q(x).reshape(B, N + 1, self.num_heads, self.head_dim).transpose(1, 2) - k = self.k(x).reshape(B, N + 1, self.num_heads, self.head_dim).transpose(1, 2) - v = self.v(x).reshape(B, N + 1, self.num_heads, self.head_dim).transpose(1, 2) - else: - x = self.qkv(x).reshape(B, N + 1, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) - q, k, v = x.unbind(0) - - rse, rce = self.pos_embed.get_embed((H, W)) - q = ops.cat([q[:, :, :1, :], apply_rot_embed(q[:, :, 1:, :], rse, rce)], 
dim=2).type_as(v) - k = ops.cat([k[:, :, :1, :], apply_rot_embed(k[:, :, 1:, :], rse, rce)], dim=2).type_as(v) - - q = q * self.scale - attn = q @ k.transpose(-2, -1) - attn = attn.softmax(dim=-1) - x = attn @ v - x = x.transpose(1, 2).reshape(B, N + 1, -1) - x = self.drop(x) - if pre_logits: - x = self._pool(x, H, W) - return x - x = self.proj(x) - x = self._pool(x, H, W) - return x - - -class AttentionPool2d(nn.Module): - """ Attention based 2D feature pooling w/ learned (absolute) pos embedding. - This is a multi-head attention based replacement for (spatial) average pooling in NN architectures. - - It was based on impl in CLIP by OpenAI - https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py - - NOTE: This requires feature size upon construction and well prevent adaptive sizing of the network. - """ - - def __init__( - self, - in_features: int, - feat_size: Union[int, Tuple[int, int]] = 7, - out_features: Optional[int] = None, - embed_dim: Optional[int] = None, - head_dim: Optional[int] = 64, - num_heads: Optional[int] = None, - qkv_bias: bool = True, - qkv_separate: bool = False, - pool_type: str = 'token', - class_token: bool = False, - drop_rate: float = 0., - ): - super().__init__() - assert pool_type in ('', 'token') - self.embed_dim = embed_dim = embed_dim or in_features - self.in_features = in_features - self.out_features = out_features or in_features - if num_heads is not None: - assert embed_dim % num_heads == 0 - head_dim = embed_dim // num_heads - else: - assert embed_dim % head_dim == 0 - num_heads = embed_dim // head_dim - self.feat_size = to_2tuple(feat_size) - self.seq_len = self.feat_size[0] * self.feat_size[1] - self.num_heads = num_heads - self.head_dim = head_dim - self.pool_type = pool_type - self.scale = self.head_dim ** -0.5 - - if class_token: - self.cls_token = nn.Parameter(ops.zeros(1, embed_dim)) - else: - self.cls_token = None - - if qkv_separate: - self.q = nn.Linear(in_features, embed_dim, bias=qkv_bias) - self.k = nn.Linear(in_features, embed_dim, bias=qkv_bias) - self.v = nn.Linear(in_features, embed_dim, bias=qkv_bias) - self.qkv = None - else: - self.q = self.k = self.v = None - self.qkv = nn.Linear(in_features, embed_dim * 3, bias=qkv_bias) - self.drop = nn.Dropout(drop_rate) - self.proj = nn.Linear(embed_dim, self.out_features) - self.pos_embed = nn.Parameter(ops.zeros(self.seq_len + 1, in_features)) - - self.init_weights() - - def init_weights(self, zero_init_last: bool = False): - if self.qkv is None: - in_features = self.q.in_features - trunc_normal_(self.q.weight, std=in_features ** -0.5) - nn.init.zeros_(self.q.bias) - trunc_normal_(self.k.weight, std=in_features ** -0.5) - nn.init.zeros_(self.k.bias) - trunc_normal_(self.v.weight, std=in_features ** -0.5) - nn.init.zeros_(self.v.bias) - else: - in_features = self.qkv.in_features - trunc_normal_(self.qkv.weight, std=in_features ** -0.5) - nn.init.zeros_(self.qkv.bias) - trunc_normal_(self.pos_embed, std=in_features ** -0.5) - - def reset(self, num_classes: Optional[int] = None, pool_type: Optional[str] = None): - # NOTE: this module is being used as a head, so need compatible reset() - if pool_type is not None: - assert pool_type in ('', 'token') - self.pool_type = pool_type - if num_classes is not None: - self.proj = nn.Linear(self.in_features, num_classes) if num_classes > 0 else nn.Identity() - self.out_features = num_classes if num_classes > 0 else self.embed_dim - - def _pool(self, x: mindspore.Tensor, H: int, W: int) -> mindspore.Tensor: - if 
self.pool_type == 'token': - x = x[:, 0] - else: - # if not pooled, return spatial output without token - x = x[:, 1:].reshape(x.shape[0], H, W, -1).permute(0, 3, 1, 2) - return x - - def forward(self, x, pre_logits: bool = False): - B, _, H, W = x.shape - N = H * W - x = x.flatten(2).transpose(1, 2) - if self.cls_token is None: - x = ops.cat([x.mean(1, keepdim=True), x], dim=1) - else: - x = ops.cat([self.cls_token.expand(x.shape[0], -1, -1), x], dim=1) - pos_embed = resample_abs_pos_embed(self.pos_embed.unsqueeze(0), (H, W), num_prefix_tokens=1) - x = x + pos_embed - - if self.qkv is None: - q = self.q(x).reshape(B, N + 1, self.num_heads, self.head_dim).transpose(1, 2) - k = self.k(x).reshape(B, N + 1, self.num_heads, self.head_dim).transpose(1, 2) - v = self.v(x).reshape(B, N + 1, self.num_heads, self.head_dim).transpose(1, 2) - else: - x = self.qkv(x).reshape(B, -1, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) - q, k, v = x.unbind(0) - - - q = q * self.scale - attn = q @ k.transpose(-2, -1) - attn = attn.softmax(dim=-1) - x = attn @ v - x = x.transpose(1, 2).reshape(B, N + 1, -1) - x = self.drop(x) - if pre_logits: - x = self._pool(x, H, W) - return x - x = self.proj(x) - x = self._pool(x, H, W) - return x diff --git a/mindnlp/mimm/layers/blur_pool.py b/mindnlp/mimm/layers/blur_pool.py deleted file mode 100644 index 74b06bdcd..000000000 --- a/mindnlp/mimm/layers/blur_pool.py +++ /dev/null @@ -1,91 +0,0 @@ -""" -BlurPool layer inspired by - - Kornia's Max_BlurPool2d - - Making Convolutional Networks Shift-Invariant Again :cite:`zhang2019shiftinvar` - -Hacked together by Chris Ha and Ross Wightman -""" -from functools import partial -from typing import Optional, Type - -import numpy as np -import mindspore -from mindnlp.core import nn -from mindnlp.core.nn import functional as F - -from .padding import get_padding -from .typing import LayerType - - -class BlurPool2d(nn.Module): - r"""Creates a module that computes blurs and downsample a given feature map. - See :cite:`zhang2019shiftinvar` for more details. - Corresponds to the Downsample class, which does blurring and subsampling - - Args: - channels = Number of input channels - filt_size (int): binomial filter size for blurring. currently supports 3 (default) and 5. - stride (int): downsampling filter stride - - Returns: - torch.Tensor: the transformed tensor. 
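The blur filter above is a binomial kernel, the outer product of the (0.5, 0.5)^(filt_size-1) coefficients, applied depthwise before the stride. A small sketch of building and applying that kernel (numpy/torch as stand-ins):

import numpy as np
import torch
import torch.nn.functional as F

filt_size, stride = 3, 2
coeffs = (np.poly1d((0.5, 0.5)) ** (filt_size - 1)).coeffs.astype(np.float32)  # [0.25, 0.5, 0.25]
kernel = torch.from_numpy(coeffs[:, None] * coeffs[None, :])                   # 3x3 binomial blur

x = torch.randn(1, 8, 16, 16)
weight = kernel.expand(x.shape[1], 1, filt_size, filt_size)    # one filter per channel
x_pad = F.pad(x, [1, 1, 1, 1], mode="reflect")                 # pad before blurring
y = F.conv2d(x_pad, weight, stride=stride, groups=x.shape[1])  # depthwise blur + downsample
print(y.shape)                                                 # torch.Size([1, 8, 8, 8])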
- """ - def __init__( - self, - channels: Optional[int] = None, - filt_size: int = 3, - stride: int = 2, - pad_mode: str = 'reflect', - ) -> None: - super(BlurPool2d, self).__init__() - assert filt_size > 1 - self.channels = channels - self.filt_size = filt_size - self.stride = stride - self.pad_mode = pad_mode - self.padding = [get_padding(filt_size, stride, dilation=1)] * 4 - - coeffs = mindspore.tensor((np.poly1d((0.5, 0.5)) ** (self.filt_size - 1)).coeffs.astype(np.float32)) - blur_filter = (coeffs[:, None] * coeffs[None, :])[None, None, :, :] - if channels is not None: - blur_filter = blur_filter.repeat(self.channels, 1, 1, 1) - self.register_buffer('filt', blur_filter, persistent=False) - - def forward(self, x: mindspore.Tensor) -> mindspore.Tensor: - x = F.pad(x, self.padding, mode=self.pad_mode) - if self.channels is None: - channels = x.shape[1] - weight = self.filt.expand(channels, 1, self.filt_size, self.filt_size) - else: - channels = self.channels - weight = self.filt - return F.conv2d(x, weight, stride=self.stride, groups=channels) - - -def create_aa( - aa_layer: LayerType, - channels: Optional[int] = None, - stride: int = 2, - enable: bool = True, - noop: Optional[Type[nn.Module]] = nn.Identity -) -> nn.Module: - """ Anti-aliasing """ - if not aa_layer or not enable: - return noop() if noop is not None else None - - if isinstance(aa_layer, str): - aa_layer = aa_layer.lower().replace('_', '').replace('-', '') - if aa_layer in ('avg', 'avgpool'): - aa_layer = nn.AvgPool2d - elif aa_layer in ('blur', 'blurpool'): - aa_layer = BlurPool2d - elif aa_layer == 'blurpc': - aa_layer = partial(BlurPool2d, pad_mode='constant') - - else: - assert False, f"Unknown anti-aliasing layer ({aa_layer})." - - try: - return aa_layer(channels=channels, stride=stride) - except TypeError as e: - return aa_layer(stride) diff --git a/mindnlp/mimm/layers/bottleneck_attn.py b/mindnlp/mimm/layers/bottleneck_attn.py deleted file mode 100644 index 5bc3d4633..000000000 --- a/mindnlp/mimm/layers/bottleneck_attn.py +++ /dev/null @@ -1,153 +0,0 @@ -""" Bottleneck Self Attention (Bottleneck Transformers) - -Paper: `Bottleneck Transformers for Visual Recognition` - https://arxiv.org/abs/2101.11605 - -@misc{2101.11605, -Author = {Aravind Srinivas and Tsung-Yi Lin and Niki Parmar and Jonathon Shlens and Pieter Abbeel and Ashish Vaswani}, -Title = {Bottleneck Transformers for Visual Recognition}, -Year = {2021}, -} - -Based on ref gist at: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2 - -This impl is a WIP but given that it is based on the ref gist likely not too far off. 
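The `create_aa` factory earlier in this hunk resolves a string or layer class to an anti-aliasing module, or a no-op when disabled. A miniature of that pattern and how it slots in after a stride-1 conv (torch stand-ins; only the 'avg' branch shown):

import torch
import torch.nn as nn

def tiny_create_aa(aa_layer, channels=None, stride=2, enable=True):
    # Return an anti-aliasing layer, or a pass-through when disabled.
    if not aa_layer or not enable:
        return nn.Identity()
    if isinstance(aa_layer, str):
        if aa_layer in ("avg", "avgpool"):
            return nn.AvgPool2d(kernel_size=stride, stride=stride)
        raise ValueError(f"Unknown anti-aliasing layer ({aa_layer}).")
    return aa_layer(channels=channels, stride=stride)  # layer classes take channels/stride

block = nn.Sequential(
    nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),  # keep the conv at stride 1 ...
    nn.ReLU(),
    tiny_create_aa("avg", channels=32, stride=2),          # ... and downsample in the AA layer
)
print(block(torch.randn(1, 3, 32, 32)).shape)              # torch.Size([1, 32, 16, 16])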
- -Hacked together by / Copyright 2021 Ross Wightman -""" -from typing import List - -from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F - -from .helpers import to_2tuple, make_divisible -from .weight_init import trunc_normal_ - - -def rel_logits_1d(q, rel_k, permute_mask: List[int]): - """ Compute relative logits along one dimension - - As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2 - Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925 - - Args: - q: (batch, heads, height, width, dim) - rel_k: (2 * width - 1, dim) - permute_mask: permute output dim according to this - """ - B, H, W, dim = q.shape - x = (q @ rel_k.transpose(-1, -2)) - x = x.reshape(-1, W, 2 * W -1) - - # pad to shift from relative to absolute indexing - x_pad = F.pad(x, [0, 1]).flatten(1) - x_pad = F.pad(x_pad, [0, W - 1]) - - # reshape and slice out the padded elements - x_pad = x_pad.reshape(-1, W + 1, 2 * W - 1) - x = x_pad[:, :W, W - 1:] - - # reshape and tile - x = x.reshape(B, H, 1, W, W).expand(-1, -1, H, -1, -1) - return x.permute(permute_mask) - - -class PosEmbedRel(nn.Module): - """ Relative Position Embedding - As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2 - Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925 - """ - def __init__(self, feat_size, dim_head, scale): - super().__init__() - self.height, self.width = to_2tuple(feat_size) - self.dim_head = dim_head - self.height_rel = nn.Parameter(ops.randn(self.height * 2 - 1, dim_head) * scale) - self.width_rel = nn.Parameter(ops.randn(self.width * 2 - 1, dim_head) * scale) - - def forward(self, q): - B, HW, _ = q.shape - - # relative logits in width dimension. - q = q.reshape(B, self.height, self.width, -1) - rel_logits_w = rel_logits_1d(q, self.width_rel, permute_mask=(0, 1, 3, 2, 4)) - - # relative logits in height dimension. - q = q.transpose(1, 2) - rel_logits_h = rel_logits_1d(q, self.height_rel, permute_mask=(0, 3, 1, 4, 2)) - - rel_logits = rel_logits_h + rel_logits_w - rel_logits = rel_logits.reshape(B, HW, HW) - return rel_logits - - -class BottleneckAttn(nn.Module): - """ Bottleneck Attention - Paper: `Bottleneck Transformers for Visual Recognition` - https://arxiv.org/abs/2101.11605 - - The internal dimensions of the attention module are controlled by the interaction of several arguments. - * the output dimension of the module is specified by dim_out, which falls back to input dim if not set - * the value (v) dimension is set to dim_out // num_heads, the v projection determines the output dim - * the query and key (qk) dimensions are determined by - * num_heads * dim_head if dim_head is not None - * num_heads * (dim_out * attn_ratio // num_heads) if dim_head is None - * as seen above, attn_ratio determines the ratio of q and k relative to the output if dim_head not used - - Args: - dim (int): input dimension to the module - dim_out (int): output dimension of the module, same as dim if not set - stride (int): output stride of the module, avg pool used if stride == 2 (default: 1). - num_heads (int): parallel attention heads (default: 4) - dim_head (int): dimension of query and key heads, calculated from dim_out * attn_ratio // num_heads if not set - qk_ratio (float): ratio of q and k dimensions to output dimension when dim_head not set. 
(default: 1.0) - qkv_bias (bool): add bias to q, k, and v projections - scale_pos_embed (bool): scale the position embedding as well as Q @ K - """ - def __init__( - self, dim, dim_out=None, feat_size=None, stride=1, num_heads=4, dim_head=None, - qk_ratio=1.0, qkv_bias=False, scale_pos_embed=False): - super().__init__() - assert feat_size is not None, 'A concrete feature size matching expected input (H, W) is required' - dim_out = dim_out or dim - assert dim_out % num_heads == 0 - self.num_heads = num_heads - self.dim_head_qk = dim_head or make_divisible(dim_out * qk_ratio, divisor=8) // num_heads - self.dim_head_v = dim_out // self.num_heads - self.dim_out_qk = num_heads * self.dim_head_qk - self.dim_out_v = num_heads * self.dim_head_v - self.scale = self.dim_head_qk ** -0.5 - self.scale_pos_embed = scale_pos_embed - - self.qkv = nn.Conv2d(dim, self.dim_out_qk * 2 + self.dim_out_v, 1, bias=qkv_bias) - - # NOTE I'm only supporting relative pos embedding for now - self.pos_embed = PosEmbedRel(feat_size, dim_head=self.dim_head_qk, scale=self.scale) - - self.pool = nn.AvgPool2d(2, 2) if stride == 2 else nn.Identity() - - self.reset_parameters() - - def reset_parameters(self): - trunc_normal_(self.qkv.weight, std=self.qkv.weight.shape[1] ** -0.5) # fan-in - trunc_normal_(self.pos_embed.height_rel, std=self.scale) - trunc_normal_(self.pos_embed.width_rel, std=self.scale) - - def forward(self, x): - B, C, H, W = x.shape - - x = self.qkv(x) # B, (2 * dim_head_qk + dim_head_v) * num_heads, H, W - - # NOTE head vs channel split ordering in qkv projection was decided before I allowed qk to differ from v - # So, this is more verbose than if heads were before qkv splits, but throughput is not impacted. - q, k, v = ops.split(x, [self.dim_out_qk, self.dim_out_qk, self.dim_out_v], dim=1) - q = q.reshape(B * self.num_heads, self.dim_head_qk, -1).transpose(-1, -2) - k = k.reshape(B * self.num_heads, self.dim_head_qk, -1) # no transpose, for q @ k - v = v.reshape(B * self.num_heads, self.dim_head_v, -1).transpose(-1, -2) - - if self.scale_pos_embed: - attn = (q @ k + self.pos_embed(q)) * self.scale # B * num_heads, H * W, H * W - else: - attn = (q @ k) * self.scale + self.pos_embed(q) - attn = attn.softmax(dim=-1) - - out = (attn @ v).transpose(-1, -2).reshape(B, self.dim_out_v, H, W) # B, dim_out, H, W - out = self.pool(out) - return out diff --git a/mindnlp/mimm/layers/cbam.py b/mindnlp/mimm/layers/cbam.py deleted file mode 100644 index 9c30c5760..000000000 --- a/mindnlp/mimm/layers/cbam.py +++ /dev/null @@ -1,110 +0,0 @@ -""" CBAM (sort-of) Attention - -Experimental impl of CBAM: Convolutional Block Attention Module: https://arxiv.org/abs/1807.06521 - -WARNING: Results with these attention layers have been mixed. They can significantly reduce performance on -some tasks, especially fine-grained it seems. I may end up removing this impl. - -Hacked together by / Copyright 2020 Ross Wightman -""" -from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F - -from .conv_bn_act import ConvNormAct -from .create_act import create_act_layer -from .helpers import make_divisible - - -class ChannelAttn(nn.Module): - """ Original CBAM channel attention module, currently avg + max pool variant only. - """ - def __init__( - self, channels, rd_ratio=1./16, rd_channels=None, rd_divisor=1, - act_layer=nn.ReLU, gate_layer='sigmoid', mlp_bias=False): - super(ChannelAttn, self).__init__() - if not rd_channels: - rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.) 
- self.fc1 = nn.Conv2d(channels, rd_channels, 1, bias=mlp_bias) - self.act = act_layer(inplace=True) - self.fc2 = nn.Conv2d(rd_channels, channels, 1, bias=mlp_bias) - self.gate = create_act_layer(gate_layer) - - def forward(self, x): - x_avg = self.fc2(self.act(self.fc1(x.mean((2, 3), keepdim=True)))) - x_max = self.fc2(self.act(self.fc1(x.amax((2, 3), keepdim=True)))) - return x * self.gate(x_avg + x_max) - - -class LightChannelAttn(ChannelAttn): - """An experimental 'lightweight' that sums avg + max pool first - """ - def __init__( - self, channels, rd_ratio=1./16, rd_channels=None, rd_divisor=1, - act_layer=nn.ReLU, gate_layer='sigmoid', mlp_bias=False): - super(LightChannelAttn, self).__init__( - channels, rd_ratio, rd_channels, rd_divisor, act_layer, gate_layer, mlp_bias) - - def forward(self, x): - x_pool = 0.5 * x.mean((2, 3), keepdim=True) + 0.5 * x.amax((2, 3), keepdim=True) - x_attn = self.fc2(self.act(self.fc1(x_pool))) - return x * F.sigmoid(x_attn) - - -class SpatialAttn(nn.Module): - """ Original CBAM spatial attention module - """ - def __init__(self, kernel_size=7, gate_layer='sigmoid'): - super(SpatialAttn, self).__init__() - self.conv = ConvNormAct(2, 1, kernel_size, apply_act=False) - self.gate = create_act_layer(gate_layer) - - def forward(self, x): - x_attn = ops.cat([x.mean(dim=1, keepdim=True), x.amax(dim=1, keepdim=True)], dim=1) - x_attn = self.conv(x_attn) - return x * self.gate(x_attn) - - -class LightSpatialAttn(nn.Module): - """An experimental 'lightweight' variant that sums avg_pool and max_pool results. - """ - def __init__(self, kernel_size=7, gate_layer='sigmoid'): - super(LightSpatialAttn, self).__init__() - self.conv = ConvNormAct(1, 1, kernel_size, apply_act=False) - self.gate = create_act_layer(gate_layer) - - def forward(self, x): - x_attn = 0.5 * x.mean(dim=1, keepdim=True) + 0.5 * x.amax(dim=1, keepdim=True) - x_attn = self.conv(x_attn) - return x * self.gate(x_attn) - - -class CbamModule(nn.Module): - def __init__( - self, channels, rd_ratio=1./16, rd_channels=None, rd_divisor=1, - spatial_kernel_size=7, act_layer=nn.ReLU, gate_layer='sigmoid', mlp_bias=False): - super(CbamModule, self).__init__() - self.channel = ChannelAttn( - channels, rd_ratio=rd_ratio, rd_channels=rd_channels, - rd_divisor=rd_divisor, act_layer=act_layer, gate_layer=gate_layer, mlp_bias=mlp_bias) - self.spatial = SpatialAttn(spatial_kernel_size, gate_layer=gate_layer) - - def forward(self, x): - x = self.channel(x) - x = self.spatial(x) - return x - - -class LightCbamModule(nn.Module): - def __init__( - self, channels, rd_ratio=1./16, rd_channels=None, rd_divisor=1, - spatial_kernel_size=7, act_layer=nn.ReLU, gate_layer='sigmoid', mlp_bias=False): - super(LightCbamModule, self).__init__() - self.channel = LightChannelAttn( - channels, rd_ratio=rd_ratio, rd_channels=rd_channels, - rd_divisor=rd_divisor, act_layer=act_layer, gate_layer=gate_layer, mlp_bias=mlp_bias) - self.spatial = LightSpatialAttn(spatial_kernel_size) - - def forward(self, x): - x = self.channel(x) - x = self.spatial(x) - return x diff --git a/mindnlp/mimm/layers/classifier.py b/mindnlp/mimm/layers/classifier.py deleted file mode 100644 index 5f128c894..000000000 --- a/mindnlp/mimm/layers/classifier.py +++ /dev/null @@ -1,282 +0,0 @@ -""" Classifier head and layer factory - -Hacked together by / Copyright 2020 Ross Wightman -""" -# pylint: disable=unbalanced-tuple-unpacking -from collections import OrderedDict -from functools import partial -from typing import Optional, Union, Callable - -from mindnlp.core 
import nn, no_grad - -from .adaptive_avgmax_pool import SelectAdaptivePool2d -from .create_act import get_act_layer -from .create_norm import get_norm_layer - - -def _create_pool( - num_features: int, - num_classes: int, - pool_type: str = 'avg', - use_conv: bool = False, - input_fmt: Optional[str] = None, -): - flatten_in_pool = not use_conv # flatten when we use a Linear layer after pooling - if not pool_type: - flatten_in_pool = False # disable flattening if pooling is pass-through (no pooling) - global_pool = SelectAdaptivePool2d( - pool_type=pool_type, - flatten=flatten_in_pool, - input_fmt=input_fmt, - ) - num_pooled_features = num_features * global_pool.feat_mult() - return global_pool, num_pooled_features - - -def _create_fc(num_features, num_classes, use_conv=False): - if num_classes <= 0: - fc = nn.Identity() # pass-through (no classifier) - elif use_conv: - fc = nn.Conv2d(num_features, num_classes, 1, bias=True) - else: - fc = nn.Linear(num_features, num_classes, bias=True) - return fc - - -def create_classifier( - num_features: int, - num_classes: int, - pool_type: str = 'avg', - use_conv: bool = False, - input_fmt: str = 'NCHW', - drop_rate: Optional[float] = None, -): - global_pool, num_pooled_features = _create_pool( - num_features, - num_classes, - pool_type, - use_conv=use_conv, - input_fmt=input_fmt, - ) - fc = _create_fc( - num_pooled_features, - num_classes, - use_conv=use_conv, - ) - if drop_rate is not None: - dropout = nn.Dropout(drop_rate) - return global_pool, dropout, fc - return global_pool, fc - - -class ClassifierHead(nn.Module): - """Classifier head w/ configurable global pooling and dropout.""" - - def __init__( - self, - in_features: int, - num_classes: int, - pool_type: str = 'avg', - drop_rate: float = 0., - use_conv: bool = False, - input_fmt: str = 'NCHW', - ): - """ - Args: - in_features: The number of input features. - num_classes: The number of classes for the final classifier layer (output). - pool_type: Global pooling type, pooling disabled if empty string (''). - drop_rate: Pre-classifier dropout rate. 
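            use_conv: Use a 1x1 nn.Conv2d instead of nn.Linear for the final classifier (see _create_fc).
            input_fmt: Input tensor layout, 'NCHW' (default) or 'NHWC', passed through to the pooling layer.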
- """ - super(ClassifierHead, self).__init__() - self.in_features = in_features - self.use_conv = use_conv - self.input_fmt = input_fmt - - global_pool, fc = create_classifier( - in_features, - num_classes, - pool_type, - use_conv=use_conv, - input_fmt=input_fmt, - ) - self.global_pool = global_pool - self.drop = nn.Dropout(drop_rate) - self.fc = fc - self.flatten = nn.Flatten(1) if use_conv and pool_type else nn.Identity() - - def reset(self, num_classes: int, pool_type: Optional[str] = None): - if pool_type is not None and pool_type != self.global_pool.pool_type: - self.global_pool, self.fc = create_classifier( - self.in_features, - num_classes, - pool_type=pool_type, - use_conv=self.use_conv, - input_fmt=self.input_fmt, - ) - self.flatten = nn.Flatten(1) if self.use_conv and pool_type else nn.Identity() - else: - num_pooled_features = self.in_features * self.global_pool.feat_mult() - self.fc = _create_fc( - num_pooled_features, - num_classes, - use_conv=self.use_conv, - ) - - def forward(self, x, pre_logits: bool = False): - x = self.global_pool(x) - x = self.drop(x) - if pre_logits: - return self.flatten(x) - x = self.fc(x) - return self.flatten(x) - - -class NormMlpClassifierHead(nn.Module): - """ A Pool -> Norm -> Mlp Classifier Head for '2D' NCHW tensors - """ - def __init__( - self, - in_features: int, - num_classes: int, - hidden_size: Optional[int] = None, - pool_type: str = 'avg', - drop_rate: float = 0., - norm_layer: Union[str, Callable] = 'layernorm2d', - act_layer: Union[str, Callable] = 'tanh', - ): - """ - Args: - in_features: The number of input features. - num_classes: The number of classes for the final classifier layer (output). - hidden_size: The hidden size of the MLP (pre-logits FC layer) if not None. - pool_type: Global pooling type, pooling disabled if empty string (''). - drop_rate: Pre-classifier dropout rate. - norm_layer: Normalization layer type. - act_layer: MLP activation layer type (only used if hidden_size is not None). 
- """ - super().__init__() - self.in_features = in_features - self.hidden_size = hidden_size - self.num_features = in_features - self.use_conv = not pool_type - norm_layer = get_norm_layer(norm_layer) - act_layer = get_act_layer(act_layer) - linear_layer = partial(nn.Conv2d, kernel_size=1) if self.use_conv else nn.Linear - - self.global_pool = SelectAdaptivePool2d(pool_type=pool_type) - self.norm = norm_layer(in_features) - self.flatten = nn.Flatten(1) if pool_type else nn.Identity() - if hidden_size: - self.pre_logits = nn.Sequential(OrderedDict([ - ('fc', linear_layer(in_features, hidden_size)), - ('act', act_layer()), - ])) - self.num_features = hidden_size - else: - self.pre_logits = nn.Identity() - self.drop = nn.Dropout(drop_rate) - self.fc = linear_layer(self.num_features, num_classes) if num_classes > 0 else nn.Identity() - - def reset(self, num_classes: int, pool_type: Optional[str] = None): - if pool_type is not None: - self.global_pool = SelectAdaptivePool2d(pool_type=pool_type) - self.flatten = nn.Flatten(1) if pool_type else nn.Identity() - self.use_conv = self.global_pool.is_identity() - linear_layer = partial(nn.Conv2d, kernel_size=1) if self.use_conv else nn.Linear - if self.hidden_size: - if ((isinstance(self.pre_logits.fc, nn.Conv2d) and not self.use_conv) or - (isinstance(self.pre_logits.fc, nn.Linear) and self.use_conv)): - with no_grad(): - new_fc = linear_layer(self.in_features, self.hidden_size) - new_fc.weight.copy_(self.pre_logits.fc.weight.reshape(new_fc.weight.shape)) - new_fc.bias.copy_(self.pre_logits.fc.bias) - self.pre_logits.fc = new_fc - self.fc = linear_layer(self.num_features, num_classes) if num_classes > 0 else nn.Identity() - - def forward(self, x, pre_logits: bool = False): - x = self.global_pool(x) - x = self.norm(x) - x = self.flatten(x) - x = self.pre_logits(x) - x = self.drop(x) - if pre_logits: - return x - x = self.fc(x) - return x - - -class ClNormMlpClassifierHead(nn.Module): - """ A Pool -> Norm -> Mlp Classifier Head for n-D NxxC tensors - """ - def __init__( - self, - in_features: int, - num_classes: int, - hidden_size: Optional[int] = None, - pool_type: str = 'avg', - drop_rate: float = 0., - norm_layer: Union[str, Callable] = 'layernorm', - act_layer: Union[str, Callable] = 'gelu', - input_fmt: str = 'NHWC', - ): - """ - Args: - in_features: The number of input features. - num_classes: The number of classes for the final classifier layer (output). - hidden_size: The hidden size of the MLP (pre-logits FC layer) if not None. - pool_type: Global pooling type, pooling disabled if empty string (''). - drop_rate: Pre-classifier dropout rate. - norm_layer: Normalization layer type. - act_layer: MLP activation layer type (only used if hidden_size is not None). 
- """ - super().__init__() - self.in_features = in_features - self.hidden_size = hidden_size - self.num_features = in_features - assert pool_type in ('', 'avg', 'max', 'avgmax') - self.pool_type = pool_type - assert input_fmt in ('NHWC', 'NLC') - self.pool_dim = 1 if input_fmt == 'NLC' else (1, 2) - norm_layer = get_norm_layer(norm_layer) - act_layer = get_act_layer(act_layer) - - self.norm = norm_layer(in_features) - if hidden_size: - self.pre_logits = nn.Sequential(OrderedDict([ - ('fc', nn.Linear(in_features, hidden_size)), - ('act', act_layer()), - ])) - self.num_features = hidden_size - else: - self.pre_logits = nn.Identity() - self.drop = nn.Dropout(drop_rate) - self.fc = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() - - def reset(self, num_classes: int, pool_type: Optional[str] = None, reset_other: bool = False): - if pool_type is not None: - self.pool_type = pool_type - if reset_other: - self.pre_logits = nn.Identity() - self.norm = nn.Identity() - self.fc = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() - - def _global_pool(self, x): - if self.pool_type: - if self.pool_type == 'avg': - x = x.mean(dim=self.pool_dim) - elif self.pool_type == 'max': - x = x.amax(dim=self.pool_dim) - elif self.pool_type == 'avgmax': - x = 0.5 * (x.amax(dim=self.pool_dim) + x.mean(dim=self.pool_dim)) - return x - - def forward(self, x, pre_logits: bool = False): - x = self._global_pool(x) - x = self.norm(x) - x = self.pre_logits(x) - x = self.drop(x) - if pre_logits: - return x - x = self.fc(x) - return x diff --git a/mindnlp/mimm/layers/cond_conv2d.py b/mindnlp/mimm/layers/cond_conv2d.py deleted file mode 100644 index d1d4e94cb..000000000 --- a/mindnlp/mimm/layers/cond_conv2d.py +++ /dev/null @@ -1,123 +0,0 @@ -""" MindSpore Conditionally Parameterized Convolution (CondConv) - -Paper: CondConv: Conditionally Parameterized Convolutions for Efficient Inference -(https://arxiv.org/abs/1904.04971) - -Hacked together by / Copyright 2020 Ross Wightman -""" - -import math -from functools import partial -import numpy as np -import mindspore -from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F - -from .helpers import to_2tuple -from .conv2d_same import conv2d_same -from .padding import get_padding_value - - -def get_condconv_initializer(initializer, num_experts, expert_shape): - def condconv_initializer(weight): - """CondConv initializer function.""" - num_params = np.prod(expert_shape) - if (len(weight.shape) != 2 or weight.shape[0] != num_experts or - weight.shape[1] != num_params): - raise (ValueError( - 'CondConv variables must have shape [num_experts, num_params]')) - for i in range(num_experts): - initializer(weight[i].view(expert_shape)) - return condconv_initializer - - -class CondConv2d(nn.Module): - """ Conditionally Parameterized Convolution - Inspired by: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/condconv/condconv_layers.py - - Grouped convolution hackery for parallel execution of the per-sample kernel filters inspired by this discussion: - https://github.com/pytorch/pytorch/issues/17983 - """ - __constants__ = ['in_channels', 'out_channels', 'dynamic_padding'] - - def __init__(self, in_channels, out_channels, kernel_size=3, - stride=1, padding='', dilation=1, groups=1, bias=False, num_experts=4): - super(CondConv2d, self).__init__() - - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = to_2tuple(kernel_size) - self.stride = 
to_2tuple(stride) - padding_val, is_padding_dynamic = get_padding_value( - padding, kernel_size, stride=stride, dilation=dilation) - self.dynamic_padding = is_padding_dynamic # if in forward to work with torchscript - self.padding = to_2tuple(padding_val) - self.dilation = to_2tuple(dilation) - self.groups = groups - self.num_experts = num_experts - - self.weight_shape = (self.out_channels, self.in_channels // self.groups) + self.kernel_size - weight_num_param = 1 - for wd in self.weight_shape: - weight_num_param *= wd - self.weight = nn.Parameter(mindspore.Tensor(self.num_experts, weight_num_param)) - - if bias: - self.bias_shape = (self.out_channels,) - self.bias = nn.Parameter(mindspore.Tensor(self.num_experts, self.out_channels)) - else: - self.register_parameter('bias', None) - - self.reset_parameters() - - def reset_parameters(self): - init_weight = get_condconv_initializer( - partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.num_experts, self.weight_shape) - init_weight(self.weight) - if self.bias is not None: - fan_in = np.prod(self.weight_shape[1:]) - bound = 1 / math.sqrt(fan_in) - init_bias = get_condconv_initializer( - partial(nn.init.uniform_, a=-bound, b=bound), self.num_experts, self.bias_shape) - init_bias(self.bias) - - def forward(self, x, routing_weights): - B, C, H, W = x.shape - weight = ops.matmul(routing_weights, self.weight) - new_weight_shape = (B * self.out_channels, self.in_channels // self.groups) + self.kernel_size - weight = weight.view(new_weight_shape) - bias = None - if self.bias is not None: - bias = ops.matmul(routing_weights, self.bias) - bias = bias.view(B * self.out_channels) - # move batch elements with channels so each batch element can be efficiently convolved with separate kernel - # reshape instead of view to work with channels_last input - x = x.reshape(1, B * C, H, W) - if self.dynamic_padding: - out = conv2d_same( - x, weight, bias, stride=self.stride, padding=self.padding, - dilation=self.dilation, groups=self.groups * B) - else: - out = F.conv2d( - x, weight, bias, stride=self.stride, padding=self.padding, - dilation=self.dilation, groups=self.groups * B) - out = out.permute([1, 0, 2, 3]).view(B, self.out_channels, out.shape[-2], out.shape[-1]) - - # Literal port (from TF definition) - # x = ops.split(x, 1, 0) - # weight = ops.split(weight, 1, 0) - # if self.bias is not None: - # bias = ops.matmul(routing_weights, self.bias) - # bias = ops.split(bias, 1, 0) - # else: - # bias = [None] * B - # out = [] - # for xi, wi, bi in zip(x, weight, bias): - # wi = wi.view(*self.weight_shape) - # if bi is not None: - # bi = bi.view(*self.bias_shape) - # out.append(self.conv_fn( - # xi, wi, bi, stride=self.stride, padding=self.padding, - # dilation=self.dilation, groups=self.groups)) - # out = ops.cat(out, 0) - return out diff --git a/mindnlp/mimm/layers/config.py b/mindnlp/mimm/layers/config.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mindnlp/mimm/layers/conv2d_same.py b/mindnlp/mimm/layers/conv2d_same.py deleted file mode 100644 index 96c06f3aa..000000000 --- a/mindnlp/mimm/layers/conv2d_same.py +++ /dev/null @@ -1,63 +0,0 @@ -""" Conv2d w/ Same Padding - -Hacked together by / Copyright 2020 Ross Wightman -""" -from typing import Tuple, Optional -import mindspore -from mindnlp.core import nn -from mindnlp.core.nn import functional as F - -from .padding import pad_same, get_padding_value - - -_USE_EXPORT_CONV = False - - -def conv2d_same( - x, - weight: mindspore.Tensor, - bias: Optional[mindspore.Tensor] = None, - stride: 
Tuple[int, int] = (1, 1), - padding: Tuple[int, int] = (0, 0), - dilation: Tuple[int, int] = (1, 1), - groups: int = 1, -): - x = pad_same(x, weight.shape[-2:], stride, dilation) - return F.conv2d(x, weight, bias, stride, (0, 0), dilation, groups) - - -class Conv2dSame(nn.Conv2d): - """ Tensorflow like 'SAME' convolution wrapper for 2D convolutions - """ - - def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True, - ): - super(Conv2dSame, self).__init__( - in_channels, out_channels, kernel_size, - stride, 0, dilation, groups, bias, - ) - - def forward(self, x): - return conv2d_same( - x, self.weight, self.bias, - self.stride, self.padding, self.dilation, self.groups, - ) - - -def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs): - padding = kwargs.pop('padding', '') - kwargs.setdefault('bias', False) - padding, is_dynamic = get_padding_value(padding, kernel_size, **kwargs) - if is_dynamic: - return Conv2dSame(in_chs, out_chs, kernel_size, **kwargs) - else: - return nn.Conv2d(in_chs, out_chs, kernel_size, padding=padding, **kwargs) diff --git a/mindnlp/mimm/layers/conv_bn_act.py b/mindnlp/mimm/layers/conv_bn_act.py deleted file mode 100644 index 34f617bba..000000000 --- a/mindnlp/mimm/layers/conv_bn_act.py +++ /dev/null @@ -1,91 +0,0 @@ -""" Conv2d + BN + Act - -Hacked together by / Copyright 2020 Ross Wightman -""" -from typing import Any, Dict, Optional, Type - -from mindnlp.core import nn - -from .typing import LayerType, PadType -from .blur_pool import create_aa -from .create_conv2d import create_conv2d -from .create_norm_act import get_norm_act_layer - - -class ConvNormAct(nn.Module): - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: int = 1, - stride: int = 1, - padding: PadType = '', - dilation: int = 1, - groups: int = 1, - bias: bool = False, - apply_norm: bool = True, - apply_act: bool = True, - norm_layer: LayerType = nn.BatchNorm2d, - act_layer: Optional[LayerType] = nn.ReLU, - aa_layer: Optional[LayerType] = None, - drop_layer: Optional[Type[nn.Module]] = None, - conv_kwargs: Optional[Dict[str, Any]] = None, - norm_kwargs: Optional[Dict[str, Any]] = None, - act_kwargs: Optional[Dict[str, Any]] = None, - ): - super(ConvNormAct, self).__init__() - conv_kwargs = conv_kwargs or {} - norm_kwargs = norm_kwargs or {} - act_kwargs = act_kwargs or {} - use_aa = aa_layer is not None and stride > 1 - - self.conv = create_conv2d( - in_channels, - out_channels, - kernel_size, - stride=1 if use_aa else stride, - padding=padding, - dilation=dilation, - groups=groups, - bias=bias, - **conv_kwargs, - ) - - if apply_norm: - # NOTE for backwards compatibility with models that use separate norm and act layer definitions - norm_act_layer = get_norm_act_layer(norm_layer, act_layer) - # NOTE for backwards (weight) compatibility, norm layer name remains `.bn` - if drop_layer: - norm_kwargs['drop_layer'] = drop_layer - self.bn = norm_act_layer( - out_channels, - apply_act=apply_act, - act_kwargs=act_kwargs, - **norm_kwargs, - ) - else: - self.bn = nn.Sequential() - if drop_layer: - norm_kwargs['drop_layer'] = drop_layer - self.bn.add_module('drop', drop_layer()) - - self.aa = create_aa(aa_layer, out_channels, stride=stride, enable=use_aa, noop=None) - - @property - def in_channels(self): - return self.conv.in_channels - - @property - def out_channels(self): - return self.conv.out_channels - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - if self.aa is not None: - x = 
self.aa(x) - return x - - -ConvBnAct = ConvNormAct -ConvNormActAa = ConvNormAct # backwards compat, when they were separate diff --git a/mindnlp/mimm/layers/create_act.py b/mindnlp/mimm/layers/create_act.py deleted file mode 100644 index 3c4d2ffe3..000000000 --- a/mindnlp/mimm/layers/create_act.py +++ /dev/null @@ -1,103 +0,0 @@ -""" Activation Factory -Hacked together by / Copyright 2020 Ross Wightman -""" -from typing import Union, Callable, Type -from mindnlp.core import nn -from mindnlp.core.nn import functional as F -from mindnlp.common.activations import QuickGELU, GELUTanh, gelu_tanh, quick_gelu, hard_mish, HardMish - -_ACT_FN_DEFAULT = dict( - silu=F.silu, - swish=F.silu, - mish=F.mish, - relu=F.relu, - relu6=F.relu6, - leaky_relu=F.leaky_relu, - elu=F.elu, - celu=F.celu, - selu=F.selu, - gelu=F.gelu, - gelu_tanh=gelu_tanh, - quick_gelu=quick_gelu, - sigmoid=F.sigmoid, - tanh=F.tanh, - hard_sigmoid=F.hardsigmoid, - hard_swish=F.hardswish, - hard_mish=hard_mish, -) - - -_ACT_FNS = (_ACT_FN_DEFAULT,) -for a in _ACT_FNS: - a.setdefault('hardsigmoid', a.get('hard_sigmoid')) - a.setdefault('hardswish', a.get('hard_swish')) - - -_ACT_LAYER_DEFAULT = dict( - silu=nn.SiLU, - swish=nn.SiLU, - mish=nn.Mish, - relu=nn.ReLU, - relu6=nn.ReLU6, - leaky_relu=nn.LeakyReLU, - elu=nn.ELU, - prelu=nn.PReLU, - celu=nn.CELU, - selu=nn.SELU, - gelu=nn.GELU, - gelu_tanh=GELUTanh, - quick_gelu=QuickGELU, - sigmoid=nn.Sigmoid, - tanh=nn.Tanh, - hard_sigmoid=nn.Hardsigmoid, - hard_swish=nn.Hardswish, - hard_mish=HardMish, - identity=nn.Identity, -) - -_ACT_LAYERS = (_ACT_LAYER_DEFAULT,) -for a in _ACT_LAYERS: - a.setdefault('hardsigmoid', a.get('hard_sigmoid')) - a.setdefault('hardswish', a.get('hard_swish')) - - -def get_act_fn(name: Union[Callable, str] = 'relu'): - """ Activation Function Factory - Fetching activation fns by name with this function allows export or torch script friendly - functions to be returned dynamically based on current config. - """ - if not name: - return None - if isinstance(name, Callable): - return name - name = name.lower() - return _ACT_FN_DEFAULT[name] - - -def get_act_layer(name: Union[Type[nn.Module], str] = 'relu'): - """ Activation Layer Factory - Fetching activation layers by name with this function allows export or torch script friendly - functions to be returned dynamically based on current config. 
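    e.g. get_act_layer('gelu') -> nn.GELU and get_act_layer('hardswish') -> nn.Hardswish; passing a
    class/callable (e.g. nn.SiLU) returns it unchanged, while '' or None return None.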
- """ - if name is None: - return None - if not isinstance(name, str): - # callable, module, etc - return name - if not name: - return None - name = name.lower() - return _ACT_LAYER_DEFAULT[name] - - -def create_act_layer(name: Union[Type[nn.Module], str], inplace=None, **kwargs): - act_layer = get_act_layer(name) - if act_layer is None: - return None - if inplace is None: - return act_layer(**kwargs) - try: - return act_layer(inplace=inplace, **kwargs) - except TypeError: - # recover if act layer doesn't have inplace arg - return act_layer(**kwargs) diff --git a/mindnlp/mimm/layers/create_attn.py b/mindnlp/mimm/layers/create_attn.py deleted file mode 100644 index 0a4b146ef..000000000 --- a/mindnlp/mimm/layers/create_attn.py +++ /dev/null @@ -1,90 +0,0 @@ -""" Attention Factory - -Hacked together by / Copyright 2021 Ross Wightman -""" -from functools import partial - -from mindnlp.core import nn - -from .bottleneck_attn import BottleneckAttn -from .cbam import CbamModule, LightCbamModule -from .eca import EcaModule, CecaModule -from .gather_excite import GatherExcite -from .global_context import GlobalContext -from .halo_attn import HaloAttn -from .lambda_layer import LambdaLayer -from .non_local_attn import NonLocalAttn, BatNonLocalAttn -from .selective_kernel import SelectiveKernel -from .split_attn import SplitAttn -from .squeeze_excite import SEModule, EffectiveSEModule - - -def get_attn(attn_type): - if isinstance(attn_type, nn.Module): - return attn_type - module_cls = None - if attn_type: - if isinstance(attn_type, str): - attn_type = attn_type.lower() - # Lightweight attention modules (channel and/or coarse spatial). - # Typically added to existing network architecture blocks in addition to existing convolutions. - if attn_type == 'se': - module_cls = SEModule - elif attn_type == 'ese': - module_cls = EffectiveSEModule - elif attn_type == 'eca': - module_cls = EcaModule - elif attn_type == 'ecam': - module_cls = partial(EcaModule, use_mlp=True) - elif attn_type == 'ceca': - module_cls = CecaModule - elif attn_type == 'ge': - module_cls = GatherExcite - elif attn_type == 'gc': - module_cls = GlobalContext - elif attn_type == 'gca': - module_cls = partial(GlobalContext, fuse_add=True, fuse_scale=False) - elif attn_type == 'cbam': - module_cls = CbamModule - elif attn_type == 'lcbam': - module_cls = LightCbamModule - - # Attention / attention-like modules w/ significant params - # Typically replace some of the existing workhorse convs in a network architecture. - # All of these accept a stride argument and can spatially downsample the input. - elif attn_type == 'sk': - module_cls = SelectiveKernel - elif attn_type == 'splat': - module_cls = SplitAttn - - # Self-attention / attention-like modules w/ significant compute and/or params - # Typically replace some of the existing workhorse convs in a network architecture. - # All of these accept a stride argument and can spatially downsample the input. - elif attn_type == 'lambda': - return LambdaLayer - elif attn_type == 'bottleneck': - return BottleneckAttn - elif attn_type == 'halo': - return HaloAttn - elif attn_type == 'nl': - module_cls = NonLocalAttn - elif attn_type == 'bat': - module_cls = BatNonLocalAttn - - # Woops! 
- else: - assert False, "Invalid attn module (%s)" % attn_type - elif isinstance(attn_type, bool): - if attn_type: - module_cls = SEModule - else: - module_cls = attn_type - return module_cls - - -def create_attn(attn_type, channels, **kwargs): - module_cls = get_attn(attn_type) - if module_cls is not None: - # NOTE: it's expected the first (positional) argument of all attention layers is the # input channels - return module_cls(channels, **kwargs) - return None diff --git a/mindnlp/mimm/layers/create_conv2d.py b/mindnlp/mimm/layers/create_conv2d.py deleted file mode 100644 index ac9489ce4..000000000 --- a/mindnlp/mimm/layers/create_conv2d.py +++ /dev/null @@ -1,36 +0,0 @@ -""" Create Conv2d Factory Method - -Hacked together by / Copyright 2020 Ross Wightman -""" - -from .mixed_conv2d import MixedConv2d -from .cond_conv2d import CondConv2d -from .conv2d_same import create_conv2d_pad - - -def create_conv2d(in_channels, out_channels, kernel_size, **kwargs): - """ Select a 2d convolution implementation based on arguments - Creates and returns one of torch.nn.Conv2d, Conv2dSame, MixedConv2d, or CondConv2d. - - Used extensively by EfficientNet, MobileNetv3 and related networks. - """ - if isinstance(kernel_size, list): - assert 'num_experts' not in kwargs # MixNet + CondConv combo not supported currently - if 'groups' in kwargs: - groups = kwargs.pop('groups') - if groups == in_channels: - kwargs['depthwise'] = True - else: - assert groups == 1 - # We're going to use only lists for defining the MixedConv2d kernel groups, - # ints, tuples, other iterables will continue to pass to normal conv and specify h, w. - m = MixedConv2d(in_channels, out_channels, kernel_size, **kwargs) - else: - depthwise = kwargs.pop('depthwise', False) - # for DW out_channels must be multiple of in_channels as must have out_channels % groups == 0 - groups = in_channels if depthwise else kwargs.pop('groups', 1) - if 'num_experts' in kwargs and kwargs['num_experts'] > 0: - m = CondConv2d(in_channels, out_channels, kernel_size, groups=groups, **kwargs) - else: - m = create_conv2d_pad(in_channels, out_channels, kernel_size, groups=groups, **kwargs) - return m diff --git a/mindnlp/mimm/layers/create_norm.py b/mindnlp/mimm/layers/create_norm.py deleted file mode 100644 index 3d986e269..000000000 --- a/mindnlp/mimm/layers/create_norm.py +++ /dev/null @@ -1,57 +0,0 @@ -""" Norm Layer Factory - -Create norm modules by string (to mirror create_act and creat_norm-act fns) - -Copyright 2022 Ross Wightman -""" -import functools -import types - -from mindnlp.core import nn - -from .norm import GroupNorm, GroupNorm1, LayerNorm, LayerNorm2d, RmsNorm, RmsNorm2d -# from torchvision.ops.misc import FrozenBatchNorm2d - -_NORM_MAP = dict( - batchnorm=nn.BatchNorm2d, - batchnorm2d=nn.BatchNorm2d, - batchnorm1d=nn.BatchNorm1d, - groupnorm=GroupNorm, - groupnorm1=GroupNorm1, - layernorm=LayerNorm, - layernorm2d=LayerNorm2d, - rmsnorm=RmsNorm, - rmsnorm2d=RmsNorm2d, - # frozenbatchnorm2d=FrozenBatchNorm2d, -) -_NORM_TYPES = {m for n, m in _NORM_MAP.items()} - - -def create_norm_layer(layer_name, num_features, **kwargs): - layer = get_norm_layer(layer_name) - layer_instance = layer(num_features, **kwargs) - return layer_instance - - -def get_norm_layer(norm_layer): - if norm_layer is None: - return None - assert isinstance(norm_layer, (type, str, types.FunctionType, functools.partial)) - norm_kwargs = {} - - # unbind partial fn, so args can be rebound later - if isinstance(norm_layer, functools.partial): - norm_kwargs.update(norm_layer.keywords) 
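        # e.g. functools.partial(nn.BatchNorm2d, eps=1e-3): keep {'eps': 1e-3} here, resolve the bare
        # layer class below, then re-bind the saved kwargs via functools.partial at the end.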
- norm_layer = norm_layer.func - - if isinstance(norm_layer, str): - if not norm_layer: - return None - layer_name = norm_layer.replace('_', '').lower() - norm_layer = _NORM_MAP[layer_name] - else: - pass - - if norm_kwargs: - norm_layer = functools.partial(norm_layer, **norm_kwargs) # bind/rebind args - return norm_layer diff --git a/mindnlp/mimm/layers/create_norm_act.py b/mindnlp/mimm/layers/create_norm_act.py deleted file mode 100644 index 7fcb968a1..000000000 --- a/mindnlp/mimm/layers/create_norm_act.py +++ /dev/null @@ -1,93 +0,0 @@ -""" NormAct (Normalizaiton + Activation Layer) Factory - -Create norm + act combo modules that attempt to be backwards compatible with separate norm + act -isntances in models. Where these are used it will be possible to swap separate BN + act layers with -combined modules like IABN or EvoNorms. - -Hacked together by / Copyright 2020 Ross Wightman -""" -import types -import functools - -from .evo_norm import EvoNorm2dB0, EvoNorm2dB1, EvoNorm2dB2, EvoNorm2dS0, EvoNorm2dS0a, EvoNorm2dS1, EvoNorm2dS1a, EvoNorm2dS2, EvoNorm2dS2a -from .filter_response_norm import FilterResponseNormAct2d, FilterResponseNormTlu2d -from .norm_act import BatchNormAct2d, GroupNormAct, LayerNormAct, LayerNormAct2d -# from .inplace_abn import InplaceAbn - -_NORM_ACT_MAP = dict( - batchnorm=BatchNormAct2d, - batchnorm2d=BatchNormAct2d, - groupnorm=GroupNormAct, - groupnorm1=functools.partial(GroupNormAct, num_groups=1), - layernorm=LayerNormAct, - layernorm2d=LayerNormAct2d, - evonormb0=EvoNorm2dB0, - evonormb1=EvoNorm2dB1, - evonormb2=EvoNorm2dB2, - evonorms0=EvoNorm2dS0, - evonorms0a=EvoNorm2dS0a, - evonorms1=EvoNorm2dS1, - evonorms1a=EvoNorm2dS1a, - evonorms2=EvoNorm2dS2, - evonorms2a=EvoNorm2dS2a, - frn=FilterResponseNormAct2d, - frntlu=FilterResponseNormTlu2d, - # inplaceabn=InplaceAbn, - # iabn=InplaceAbn, -) -_NORM_ACT_TYPES = {m for n, m in _NORM_ACT_MAP.items()} -# has act_layer arg to define act type -_NORM_ACT_REQUIRES_ARG = { - BatchNormAct2d, GroupNormAct, LayerNormAct, LayerNormAct2d, FilterResponseNormAct2d}#, InplaceAbn} - - -def create_norm_act_layer(layer_name, num_features, act_layer=None, apply_act=True, jit=False, **kwargs): - layer = get_norm_act_layer(layer_name, act_layer=act_layer) - layer_instance = layer(num_features, apply_act=apply_act, **kwargs) - return layer_instance - - -def get_norm_act_layer(norm_layer, act_layer=None): - if norm_layer is None: - return None - assert isinstance(norm_layer, (type, str, types.FunctionType, functools.partial)) - assert act_layer is None or isinstance(act_layer, (type, str, types.FunctionType, functools.partial)) - norm_act_kwargs = {} - - # unbind partial fn, so args can be rebound later - if isinstance(norm_layer, functools.partial): - norm_act_kwargs.update(norm_layer.keywords) - norm_layer = norm_layer.func - - if isinstance(norm_layer, str): - if not norm_layer: - return None - layer_name = norm_layer.replace('_', '').lower().split('-')[0] - norm_act_layer = _NORM_ACT_MAP[layer_name] - elif norm_layer in _NORM_ACT_TYPES: - norm_act_layer = norm_layer - elif isinstance(norm_layer, types.FunctionType): - # if function type, must be a lambda/fn that creates a norm_act layer - norm_act_layer = norm_layer - else: - type_name = norm_layer.__name__.lower() - if type_name.startswith('batchnorm'): - norm_act_layer = BatchNormAct2d - elif type_name.startswith('groupnorm'): - norm_act_layer = GroupNormAct - elif type_name.startswith('groupnorm1'): - norm_act_layer = functools.partial(GroupNormAct, num_groups=1) - elif 
type_name.startswith('layernorm2d'): - norm_act_layer = LayerNormAct2d - elif type_name.startswith('layernorm'): - norm_act_layer = LayerNormAct - else: - assert False, f"No equivalent norm_act layer for {type_name}" - - if norm_act_layer in _NORM_ACT_REQUIRES_ARG: - # pass `act_layer` through for backwards compat where `act_layer=None` implies no activation. - # In the future, may force use of `apply_act` with `act_layer` arg bound to relevant NormAct types - norm_act_kwargs.setdefault('act_layer', act_layer) - if norm_act_kwargs: - norm_act_layer = functools.partial(norm_act_layer, **norm_act_kwargs) # bind/rebind args - return norm_act_layer diff --git a/mindnlp/mimm/layers/drop.py b/mindnlp/mimm/layers/drop.py deleted file mode 100644 index db7504276..000000000 --- a/mindnlp/mimm/layers/drop.py +++ /dev/null @@ -1,182 +0,0 @@ -""" DropBlock, DropPath - -PyTorch implementations of DropBlock and DropPath (Stochastic Depth) regularization layers. - -Papers: -DropBlock: A regularization method for convolutional networks (https://arxiv.org/abs/1810.12890) - -Deep Networks with Stochastic Depth (https://arxiv.org/abs/1603.09382) - -Code: -DropBlock impl inspired by two Tensorflow impl that I liked: - - https://github.com/tensorflow/tpu/blob/master/models/official/resnet/resnet_model.py#L74 - - https://github.com/clovaai/assembled-cnn/blob/master/nets/blocks.py - -Hacked together by / Copyright 2020 Ross Wightman -""" -import mindspore -from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F - -from .grid import ndgrid - - -def drop_block_2d( - x, - drop_prob: float = 0.1, - block_size: int = 7, - gamma_scale: float = 1.0, - with_noise: bool = False, - inplace: bool = False, - batchwise: bool = False -): - """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf - - DropBlock with an experimental gaussian noise option. This layer has been tested on a few training - runs with success, but needs further validation and possibly optimization for lower runtime impact. - """ - B, C, H, W = x.shape - total_size = W * H - clipped_block_size = min(block_size, min(W, H)) - # seed_drop_rate, the gamma parameter - gamma = gamma_scale * drop_prob * total_size / clipped_block_size ** 2 / ( - (W - block_size + 1) * (H - block_size + 1)) - - # Forces the block to be inside the feature map. 
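    # valid_block is a (1, 1, H, W) mask that is 1 only where a clipped_block_size x clipped_block_size
    # block centered at that position lies fully inside the feature map, so drop-block seeds are never
    # sampled at positions whose block would spill over the border.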
- w_i, h_i = ndgrid(ops.arange(W), ops.arange(H)) - valid_block = ((w_i >= clipped_block_size // 2) & (w_i < W - (clipped_block_size - 1) // 2)) & \ - ((h_i >= clipped_block_size // 2) & (h_i < H - (clipped_block_size - 1) // 2)) - valid_block = ops.reshape(valid_block, (1, 1, H, W)).to(dtype=x.dtype) - - if batchwise: - # one mask for whole batch, quite a bit faster - uniform_noise = ops.rand((1, C, H, W), dtype=x.dtype) - else: - uniform_noise = ops.rand_like(x) - block_mask = ((2 - gamma - valid_block + uniform_noise) >= 1).to(dtype=x.dtype) - block_mask = -F.max_pool2d( - -block_mask, - kernel_size=clipped_block_size, # block_size, - stride=1, - padding=clipped_block_size // 2) - - if with_noise: - normal_noise = ops.randn((1, C, H, W), dtype=x.dtype) if batchwise else ops.randn_like(x) - if inplace: - x.mul_(block_mask).add_(normal_noise * (1 - block_mask)) - else: - x = x * block_mask + normal_noise * (1 - block_mask) - else: - normalize_scale = (block_mask.numel() / block_mask.to(dtype=mindspore.float32).sum().add(1e-7)).to(x.dtype) - if inplace: - x.mul_(block_mask * normalize_scale) - else: - x = x * block_mask * normalize_scale - return x - - -def drop_block_fast_2d( - x: mindspore.Tensor, - drop_prob: float = 0.1, - block_size: int = 7, - gamma_scale: float = 1.0, - with_noise: bool = False, - inplace: bool = False, -): - """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf - - DropBlock with an experimental gaussian noise option. Simplied from above without concern for valid - block mask at edges. - """ - B, C, H, W = x.shape - total_size = W * H - clipped_block_size = min(block_size, min(W, H)) - gamma = gamma_scale * drop_prob * total_size / clipped_block_size ** 2 / ( - (W - block_size + 1) * (H - block_size + 1)) - - block_mask = ops.empty_like(x).bernoulli_(gamma) - block_mask = F.max_pool2d( - block_mask.to(x.dtype), kernel_size=clipped_block_size, stride=1, padding=clipped_block_size // 2) - - if with_noise: - normal_noise = ops.empty_like(x).normal_() - if inplace: - x.mul_(1. - block_mask).add_(normal_noise * block_mask) - else: - x = x * (1. - block_mask) + normal_noise * block_mask - else: - block_mask = 1 - block_mask - normalize_scale = (block_mask.numel() / block_mask.to(dtype=mindspore.float32).sum().add(1e-6)).to(dtype=x.dtype) - if inplace: - x.mul_(block_mask * normalize_scale) - else: - x = x * block_mask * normalize_scale - return x - - -class DropBlock2d(nn.Module): - """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf - """ - - def __init__( - self, - drop_prob: float = 0.1, - block_size: int = 7, - gamma_scale: float = 1.0, - with_noise: bool = False, - inplace: bool = False, - batchwise: bool = False, - fast: bool = True): - super(DropBlock2d, self).__init__() - self.drop_prob = drop_prob - self.gamma_scale = gamma_scale - self.block_size = block_size - self.with_noise = with_noise - self.inplace = inplace - self.batchwise = batchwise - self.fast = fast # FIXME finish comparisons of fast vs not - - def forward(self, x): - if not self.training or not self.drop_prob: - return x - if self.fast: - return drop_block_fast_2d( - x, self.drop_prob, self.block_size, self.gamma_scale, self.with_noise, self.inplace) - else: - return drop_block_2d( - x, self.drop_prob, self.block_size, self.gamma_scale, self.with_noise, self.inplace, self.batchwise) - - -def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
- - This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, - the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... - See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for - changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use - 'survival rate' as the argument. - - """ - if drop_prob == 0. or not training: - return x - keep_prob = 1 - drop_prob - shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets - random_tensor = x.new_empty(shape).bernoulli_(keep_prob) - if keep_prob > 0.0 and scale_by_keep: - random_tensor.div_(keep_prob) - return x * random_tensor - - -class DropPath(nn.Module): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - """ - def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True): - super(DropPath, self).__init__() - self.drop_prob = drop_prob - self.scale_by_keep = scale_by_keep - - def forward(self, x): - return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) - - def extra_repr(self): - return f'drop_prob={round(self.drop_prob,3):0.3f}' diff --git a/mindnlp/mimm/layers/eca.py b/mindnlp/mimm/layers/eca.py deleted file mode 100644 index 888f066c8..000000000 --- a/mindnlp/mimm/layers/eca.py +++ /dev/null @@ -1,145 +0,0 @@ -""" -ECA module from ECAnet - -paper: ECA-Net: Efficient Channel Attention for Deep Convolutional Neural Networks -https://arxiv.org/abs/1910.03151 - -Original ECA model borrowed from https://github.com/BangguWu/ECANet - -Modified circular ECA implementation and adaption for use in timm package -by Chris Ha https://github.com/VRandme - -Original License: - -MIT License - -Copyright (c) 2019 BangguWu, Qilong Wang - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -import math -from mindnlp.core import nn -from mindnlp.core.nn import functional as F - - -from .create_act import create_act_layer -from .helpers import make_divisible - - -class EcaModule(nn.Module): - """Constructs an ECA module. - - Args: - channels: Number of channels of the input feature map for use in adaptive kernel sizes - for actual calculations according to channel. - gamma, beta: when channel is given parameters of mapping function - refer to original paper https://arxiv.org/pdf/1910.03151.pdf - (default=None. if channel size not given, use k_size given for kernel size.) 
- kernel_size: Adaptive selection of kernel size (default=3) - gamm: used in kernel_size calc, see above - beta: used in kernel_size calc, see above - act_layer: optional non-linearity after conv, enables conv bias, this is an experiment - gate_layer: gating non-linearity to use - """ - def __init__( - self, channels=None, kernel_size=3, gamma=2, beta=1, act_layer=None, gate_layer='sigmoid', - rd_ratio=1/8, rd_channels=None, rd_divisor=8, use_mlp=False): - super(EcaModule, self).__init__() - if channels is not None: - t = int(abs(math.log(channels, 2) + beta) / gamma) - kernel_size = max(t if t % 2 else t + 1, 3) - assert kernel_size % 2 == 1 - padding = (kernel_size - 1) // 2 - if use_mlp: - # NOTE 'mlp' mode is a timm experiment, not in paper - assert channels is not None - if rd_channels is None: - rd_channels = make_divisible(channels * rd_ratio, divisor=rd_divisor) - act_layer = act_layer or nn.ReLU - self.conv = nn.Conv1d(1, rd_channels, kernel_size=1, padding=0, bias=True) - self.act = create_act_layer(act_layer) - self.conv2 = nn.Conv1d(rd_channels, 1, kernel_size=kernel_size, padding=padding, bias=True) - else: - self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=padding, bias=False) - self.act = None - self.conv2 = None - self.gate = create_act_layer(gate_layer) - - def forward(self, x): - y = x.mean((2, 3)).view(x.shape[0], 1, -1) # view for 1d conv - y = self.conv(y) - if self.conv2 is not None: - y = self.act(y) - y = self.conv2(y) - y = self.gate(y).view(x.shape[0], -1, 1, 1) - return x * y.expand_as(x) - - -EfficientChannelAttn = EcaModule # alias - - -class CecaModule(nn.Module): - """Constructs a circular ECA module. - - ECA module where the conv uses circular padding rather than zero padding. - Unlike the spatial dimension, the channels do not have inherent ordering nor - locality. Although this module in essence, applies such an assumption, it is unnecessary - to limit the channels on either "edge" from being circularly adapted to each other. - This will fundamentally increase connectivity and possibly increase performance metrics - (accuracy, robustness), without significantly impacting resource metrics - (parameter size, throughput,latency, etc) - - Args: - channels: Number of channels of the input feature map for use in adaptive kernel sizes - for actual calculations according to channel. - gamma, beta: when channel is given parameters of mapping function - refer to original paper https://arxiv.org/pdf/1910.03151.pdf - (default=None. if channel size not given, use k_size given for kernel size.) 
- kernel_size: Adaptive selection of kernel size (default=3) - gamm: used in kernel_size calc, see above - beta: used in kernel_size calc, see above - act_layer: optional non-linearity after conv, enables conv bias, this is an experiment - gate_layer: gating non-linearity to use - """ - - def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1, act_layer=None, gate_layer='sigmoid'): - super(CecaModule, self).__init__() - if channels is not None: - t = int(abs(math.log(channels, 2) + beta) / gamma) - kernel_size = max(t if t % 2 else t + 1, 3) - has_act = act_layer is not None - assert kernel_size % 2 == 1 - - # PyTorch circular padding mode is buggy as of pytorch 1.4 - # see https://github.com/pytorch/pytorch/pull/17240 - # implement manual circular padding - self.padding = (kernel_size - 1) // 2 - self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=0, bias=has_act) - self.gate = create_act_layer(gate_layer) - - def forward(self, x): - y = x.mean((2, 3)).view(x.shape[0], 1, -1) - # Manually implement circular padding, F.pad does not seemed to be bugged - y = F.pad(y, (self.padding, self.padding), mode='circular') - y = self.conv(y) - y = self.gate(y).view(x.shape[0], -1, 1, 1) - return x * y.expand_as(x) - - -CircularEfficientChannelAttn = CecaModule diff --git a/mindnlp/mimm/layers/evo_norm.py b/mindnlp/mimm/layers/evo_norm.py deleted file mode 100644 index 4e114f3a2..000000000 --- a/mindnlp/mimm/layers/evo_norm.py +++ /dev/null @@ -1,343 +0,0 @@ -""" EvoNorm in MindSpore - -Based on `Evolving Normalization-Activation Layers` - https://arxiv.org/abs/2004.02967 -@inproceedings{NEURIPS2020, - author = {Liu, Hanxiao and Brock, Andy and Simonyan, Karen and Le, Quoc}, - booktitle = {Advances in Neural Information Processing Systems}, - editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin}, - pages = {13539--13550}, - publisher = {Curran Associates, Inc.}, - title = {Evolving Normalization-Activation Layers}, - url = {https://proceedings.neurips.cc/paper/2020/file/9d4c03631b8b0c85ae08bf05eda37d0f-Paper.pdf}, - volume = {33}, - year = {2020} -} - -An attempt at getting decent performing EvoNorms running in PyTorch. -While faster than other PyTorch impl, still quite a ways off the built-in BatchNorm -in terms of memory usage and throughput on GPUs. 
- -""" -from typing import Sequence, Union - -from mindnlp.core import nn, ops - -from .create_act import create_act_layer - - -def instance_std(x, eps: float = 1e-5): - std = x.float().var(dim=(2, 3), unbiased=False, keepdim=True).add(eps).sqrt().to(x.dtype) - return std.expand(x.shape) - - -def instance_std_tpu(x, eps: float = 1e-5): - std = manual_var(x, dim=(2, 3)).add(eps).sqrt() - return std.expand(x.shape) -# instance_std = instance_std_tpu - - -def instance_rms(x, eps: float = 1e-5): - rms = x.float().square().mean(dim=(2, 3), keepdim=True).add(eps).sqrt().to(x.dtype) - return rms.expand(x.shape) - - -def manual_var(x, dim: Union[int, Sequence[int]], diff_sqm: bool = False): - xm = x.mean(dim=dim, keepdim=True) - if diff_sqm: - # difference of squared mean and mean squared, faster on TPU can be less stable - var = ((x * x).mean(dim=dim, keepdim=True) - (xm * xm)).clamp(0) - else: - var = ((x - xm) * (x - xm)).mean(dim=dim, keepdim=True) - return var - - -def group_std(x, groups: int = 32, eps: float = 1e-5, flatten: bool = False): - B, C, H, W = x.shape - x_dtype = x.dtype - assert C % groups == 0, '' - if flatten: - x = x.reshape(B, groups, -1) - std = x.float().var(dim=2, unbiased=False, keepdim=True).add(eps).sqrt().to(x_dtype) - else: - x = x.reshape(B, groups, C // groups, H, W) - std = x.float().var(dim=(2, 3, 4), unbiased=False, keepdim=True).add(eps).sqrt().to(x_dtype) - return std.expand(x.shape).reshape(B, C, H, W) - - -def group_std_tpu(x, groups: int = 32, eps: float = 1e-5, diff_sqm: bool = False, flatten: bool = False): - # This is a workaround for some stability / odd behaviour of .var and .std - # running on PyTorch XLA w/ TPUs. These manual var impl are producing much better results - B, C, H, W = x.shape - assert C % groups == 0, '' - if flatten: - x = x.reshape(B, groups, -1) - var = manual_var(x, dim=-1, diff_sqm=diff_sqm) - else: - x = x.reshape(B, groups, C // groups, H, W) - var = manual_var(x, dim=(2, 3, 4), diff_sqm=diff_sqm) - return var.add(eps).sqrt().expand(x.shape).reshape(B, C, H, W) - - -def group_rms(x, groups: int = 32, eps: float = 1e-5): - B, C, H, W = x.shape - assert C % groups == 0, '' - x_dtype = x.dtype - x = x.reshape(B, groups, C // groups, H, W) - rms = x.float().square().mean(dim=(2, 3, 4), keepdim=True).add(eps).sqrt_().to(x_dtype) - return rms.expand(x.shape).reshape(B, C, H, W) - - -class EvoNorm2dB0(nn.Module): - def __init__(self, num_features, apply_act=True, momentum=0.1, eps=1e-3, **_): - super().__init__() - self.apply_act = apply_act # apply activation (non-linearity) - self.momentum = momentum - self.eps = eps - self.weight = nn.Parameter(ops.ones(num_features)) - self.bias = nn.Parameter(ops.zeros(num_features)) - self.v = nn.Parameter(ops.ones(num_features)) if apply_act else None - self.register_buffer('running_var', ops.ones(num_features)) - self.reset_parameters() - - def reset_parameters(self): - nn.init.ones_(self.weight) - nn.init.zeros_(self.bias) - if self.v is not None: - nn.init.ones_(self.v) - - def forward(self, x): - assert x.dim() == 4, 'expected 4D input' - x_dtype = x.dtype - v_shape = (1, -1, 1, 1) - if self.v is not None: - if self.training: - var = x.float().var(dim=(0, 2, 3), unbiased=False) - # var = manual_var(x, dim=(0, 2, 3)).squeeze() - n = x.numel() / x.shape[1] - self.running_var.copy_( - self.running_var * (1 - self.momentum) + - var.detach() * self.momentum * (n / (n - 1))) - else: - var = self.running_var - left = var.add(self.eps).sqrt_().to(x_dtype).view(v_shape).expand_as(x) - v = 
self.v.to(x_dtype).view(v_shape) - right = x * v + instance_std(x, self.eps) - x = x / left.max(right) - return x * self.weight.to(x_dtype).view(v_shape) + self.bias.to(x_dtype).view(v_shape) - - -class EvoNorm2dB1(nn.Module): - def __init__(self, num_features, apply_act=True, momentum=0.1, eps=1e-5, **_): - super().__init__() - self.apply_act = apply_act # apply activation (non-linearity) - self.momentum = momentum - self.eps = eps - self.weight = nn.Parameter(ops.ones(num_features)) - self.bias = nn.Parameter(ops.zeros(num_features)) - self.register_buffer('running_var', ops.ones(num_features)) - self.reset_parameters() - - def reset_parameters(self): - nn.init.ones_(self.weight) - nn.init.zeros_(self.bias) - - def forward(self, x): - assert x.dim() == 4, 'expected 4D input' - x_dtype = x.dtype - v_shape = (1, -1, 1, 1) - if self.apply_act: - if self.training: - var = x.float().var(dim=(0, 2, 3), unbiased=False) - n = x.numel() / x.shape[1] - self.running_var.copy_( - self.running_var * (1 - self.momentum) + - var.detach().to(self.running_var.dtype) * self.momentum * (n / (n - 1))) - else: - var = self.running_var - var = var.to(x_dtype).view(v_shape) - left = var.add(self.eps).sqrt_() - right = (x + 1) * instance_rms(x, self.eps) - x = x / left.max(right) - return x * self.weight.view(v_shape).to(x_dtype) + self.bias.view(v_shape).to(x_dtype) - - -class EvoNorm2dB2(nn.Module): - def __init__(self, num_features, apply_act=True, momentum=0.1, eps=1e-5, **_): - super().__init__() - self.apply_act = apply_act # apply activation (non-linearity) - self.momentum = momentum - self.eps = eps - self.weight = nn.Parameter(ops.ones(num_features)) - self.bias = nn.Parameter(ops.zeros(num_features)) - self.register_buffer('running_var', ops.ones(num_features)) - self.reset_parameters() - - def reset_parameters(self): - nn.init.ones_(self.weight) - nn.init.zeros_(self.bias) - - def forward(self, x): - assert x.dim() == 4, 'expected 4D input' - x_dtype = x.dtype - v_shape = (1, -1, 1, 1) - if self.apply_act: - if self.training: - var = x.float().var(dim=(0, 2, 3), unbiased=False) - n = x.numel() / x.shape[1] - self.running_var.copy_( - self.running_var * (1 - self.momentum) + - var.detach().to(self.running_var.dtype) * self.momentum * (n / (n - 1))) - else: - var = self.running_var - var = var.to(x_dtype).view(v_shape) - left = var.add(self.eps).sqrt_() - right = instance_rms(x, self.eps) - x - x = x / left.max(right) - return x * self.weight.view(v_shape).to(x_dtype) + self.bias.view(v_shape).to(x_dtype) - - -class EvoNorm2dS0(nn.Module): - def __init__(self, num_features, groups=32, group_size=None, apply_act=True, eps=1e-5, **_): - super().__init__() - self.apply_act = apply_act # apply activation (non-linearity) - if group_size: - assert num_features % group_size == 0 - self.groups = num_features // group_size - else: - self.groups = groups - self.eps = eps - self.weight = nn.Parameter(ops.ones(num_features)) - self.bias = nn.Parameter(ops.zeros(num_features)) - self.v = nn.Parameter(ops.ones(num_features)) if apply_act else None - self.reset_parameters() - - def reset_parameters(self): - nn.init.ones_(self.weight) - nn.init.zeros_(self.bias) - if self.v is not None: - nn.init.ones_(self.v) - - def forward(self, x): - assert x.dim() == 4, 'expected 4D input' - x_dtype = x.dtype - v_shape = (1, -1, 1, 1) - if self.v is not None: - v = self.v.view(v_shape).to(x_dtype) - x = x * (x * v).sigmoid() / group_std(x, self.groups, self.eps) - return x * self.weight.view(v_shape).to(x_dtype) + 
self.bias.view(v_shape).to(x_dtype) - - -class EvoNorm2dS0a(EvoNorm2dS0): - def __init__(self, num_features, groups=32, group_size=None, apply_act=True, eps=1e-3, **_): - super().__init__( - num_features, groups=groups, group_size=group_size, apply_act=apply_act, eps=eps) - - def forward(self, x): - assert x.dim() == 4, 'expected 4D input' - x_dtype = x.dtype - v_shape = (1, -1, 1, 1) - d = group_std(x, self.groups, self.eps) - if self.v is not None: - v = self.v.view(v_shape).to(x_dtype) - x = x * (x * v).sigmoid() - x = x / d - return x * self.weight.view(v_shape).to(x_dtype) + self.bias.view(v_shape).to(x_dtype) - - -class EvoNorm2dS1(nn.Module): - def __init__( - self, num_features, groups=32, group_size=None, - apply_act=True, act_layer=None, eps=1e-5, **_): - super().__init__() - act_layer = act_layer or nn.SiLU - self.apply_act = apply_act # apply activation (non-linearity) - if act_layer is not None and apply_act: - self.act = create_act_layer(act_layer) - else: - self.act = nn.Identity() - if group_size: - assert num_features % group_size == 0 - self.groups = num_features // group_size - else: - self.groups = groups - self.eps = eps - self.pre_act_norm = False - self.weight = nn.Parameter(ops.ones(num_features)) - self.bias = nn.Parameter(ops.zeros(num_features)) - self.reset_parameters() - - def reset_parameters(self): - nn.init.ones_(self.weight) - nn.init.zeros_(self.bias) - - def forward(self, x): - assert x.dim() == 4, 'expected 4D input' - x_dtype = x.dtype - v_shape = (1, -1, 1, 1) - if self.apply_act: - x = self.act(x) / group_std(x, self.groups, self.eps) - return x * self.weight.view(v_shape).to(x_dtype) + self.bias.view(v_shape).to(x_dtype) - - -class EvoNorm2dS1a(EvoNorm2dS1): - def __init__( - self, num_features, groups=32, group_size=None, - apply_act=True, act_layer=None, eps=1e-3, **_): - super().__init__( - num_features, groups=groups, group_size=group_size, apply_act=apply_act, act_layer=act_layer, eps=eps) - - def forward(self, x): - assert x.dim() == 4, 'expected 4D input' - x_dtype = x.dtype - v_shape = (1, -1, 1, 1) - x = self.act(x) / group_std(x, self.groups, self.eps) - return x * self.weight.view(v_shape).to(x_dtype) + self.bias.view(v_shape).to(x_dtype) - - -class EvoNorm2dS2(nn.Module): - def __init__( - self, num_features, groups=32, group_size=None, - apply_act=True, act_layer=None, eps=1e-5, **_): - super().__init__() - act_layer = act_layer or nn.SiLU - self.apply_act = apply_act # apply activation (non-linearity) - if act_layer is not None and apply_act: - self.act = create_act_layer(act_layer) - else: - self.act = nn.Identity() - if group_size: - assert num_features % group_size == 0 - self.groups = num_features // group_size - else: - self.groups = groups - self.eps = eps - self.weight = nn.Parameter(ops.ones(num_features)) - self.bias = nn.Parameter(ops.zeros(num_features)) - self.reset_parameters() - - def reset_parameters(self): - nn.init.ones_(self.weight) - nn.init.zeros_(self.bias) - - def forward(self, x): - assert x.dim() == 4, 'expected 4D input' - x_dtype = x.dtype - v_shape = (1, -1, 1, 1) - if self.apply_act: - x = self.act(x) / group_rms(x, self.groups, self.eps) - return x * self.weight.view(v_shape).to(x_dtype) + self.bias.view(v_shape).to(x_dtype) - - -class EvoNorm2dS2a(EvoNorm2dS2): - def __init__( - self, num_features, groups=32, group_size=None, - apply_act=True, act_layer=None, eps=1e-3, **_): - super().__init__( - num_features, groups=groups, group_size=group_size, apply_act=apply_act, act_layer=act_layer, eps=eps) - - def 
forward(self, x): - assert x.dim() == 4, 'expected 4D input' - x_dtype = x.dtype - v_shape = (1, -1, 1, 1) - x = self.act(x) / group_rms(x, self.groups, self.eps) - return x * self.weight.view(v_shape).to(x_dtype) + self.bias.view(v_shape).to(x_dtype) diff --git a/mindnlp/mimm/layers/filter_response_norm.py b/mindnlp/mimm/layers/filter_response_norm.py deleted file mode 100644 index 696ff8c42..000000000 --- a/mindnlp/mimm/layers/filter_response_norm.py +++ /dev/null @@ -1,66 +0,0 @@ -""" Filter Response Norm in MindSpore - -Based on `Filter Response Normalization Layer` - https://arxiv.org/abs/1911.09737 - -Hacked together by / Copyright 2021 Ross Wightman -""" -from mindnlp.core import nn, ops - -from .create_act import create_act_layer - - -def inv_instance_rms(x, eps: float = 1e-5): - rms = x.square().float().mean(dim=(2, 3), keepdim=True).add(eps).rsqrt().to(x.dtype) - return rms.expand(x.shape) - - -class FilterResponseNormTlu2d(nn.Module): - def __init__(self, num_features, apply_act=True, eps=1e-5, rms=True, **_): - super(FilterResponseNormTlu2d, self).__init__() - self.apply_act = apply_act # apply activation (non-linearity) - self.rms = rms - self.eps = eps - self.weight = nn.Parameter(ops.ones(num_features)) - self.bias = nn.Parameter(ops.zeros(num_features)) - self.tau = nn.Parameter(ops.zeros(num_features)) if apply_act else None - self.reset_parameters() - - def reset_parameters(self): - nn.init.ones_(self.weight) - nn.init.zeros_(self.bias) - if self.tau is not None: - nn.init.zeros_(self.tau) - - def forward(self, x): - assert x.dim() == 4, 'expected 4D input' - x_dtype = x.dtype - v_shape = (1, -1, 1, 1) - x = x * inv_instance_rms(x, self.eps) - x = x * self.weight.view(v_shape).to(dtype=x_dtype) + self.bias.view(v_shape).to(dtype=x_dtype) - return ops.maximum(x, self.tau.reshape(v_shape).to(dtype=x_dtype)) if self.tau is not None else x - - -class FilterResponseNormAct2d(nn.Module): - def __init__(self, num_features, apply_act=True, act_layer=nn.ReLU, inplace=None, rms=True, eps=1e-5, **_): - super(FilterResponseNormAct2d, self).__init__() - if act_layer is not None and apply_act: - self.act = create_act_layer(act_layer, inplace=inplace) - else: - self.act = nn.Identity() - self.rms = rms - self.eps = eps - self.weight = nn.Parameter(ops.ones(num_features)) - self.bias = nn.Parameter(ops.zeros(num_features)) - self.reset_parameters() - - def reset_parameters(self): - nn.init.ones_(self.weight) - nn.init.zeros_(self.bias) - - def forward(self, x): - assert x.dim() == 4, 'expected 4D input' - x_dtype = x.dtype - v_shape = (1, -1, 1, 1) - x = x * inv_instance_rms(x, self.eps) - x = x * self.weight.view(v_shape).to(dtype=x_dtype) + self.bias.view(v_shape).to(dtype=x_dtype) - return self.act(x) diff --git a/mindnlp/mimm/layers/format.py b/mindnlp/mimm/layers/format.py deleted file mode 100644 index 7e6c0f137..000000000 --- a/mindnlp/mimm/layers/format.py +++ /dev/null @@ -1,59 +0,0 @@ -"""format""" -from enum import Enum -from typing import Union - -import mindspore - - -class Format(str, Enum): - NCHW = 'NCHW' - NHWC = 'NHWC' - NCL = 'NCL' - NLC = 'NLC' - - -FormatT = Union[str, Format] - - -def get_spatial_dim(fmt: FormatT): - fmt = Format(fmt) - if fmt is Format.NLC: - dim = (1,) - elif fmt is Format.NCL: - dim = (2,) - elif fmt is Format.NHWC: - dim = (1, 2) - else: - dim = (2, 3) - return dim - - -def get_channel_dim(fmt: FormatT): - fmt = Format(fmt) - if fmt is Format.NHWC: - dim = 3 - elif fmt is Format.NLC: - dim = 2 - else: - dim = 1 - return dim - - -def nchw_to(x: 
mindspore.Tensor, fmt: Format): - if fmt == Format.NHWC: - x = x.permute(0, 2, 3, 1) - elif fmt == Format.NLC: - x = x.flatten(2).transpose(1, 2) - elif fmt == Format.NCL: - x = x.flatten(2) - return x - - -def nhwc_to(x: mindspore.Tensor, fmt: Format): - if fmt == Format.NCHW: - x = x.permute(0, 3, 1, 2) - elif fmt == Format.NLC: - x = x.flatten(1, 2) - elif fmt == Format.NCL: - x = x.flatten(1, 2).transpose(1, 2) - return x diff --git a/mindnlp/mimm/layers/gather_excite.py b/mindnlp/mimm/layers/gather_excite.py deleted file mode 100644 index 4b6d0a4ee..000000000 --- a/mindnlp/mimm/layers/gather_excite.py +++ /dev/null @@ -1,90 +0,0 @@ -""" Gather-Excite Attention Block - -Paper: `Gather-Excite: Exploiting Feature Context in CNNs` - https://arxiv.org/abs/1810.12348 - -Official code here, but it's only partial impl in Caffe: https://github.com/hujie-frank/GENet - -I've tried to support all of the extent both w/ and w/o params. I don't believe I've seen another -impl that covers all of the cases. - -NOTE: extent=0 + extra_params=False is equivalent to Squeeze-and-Excitation - -Hacked together by / Copyright 2021 Ross Wightman -""" -import math - -from mindnlp.core import nn -from mindnlp.core.nn import functional as F - -from .create_act import create_act_layer, get_act_layer -from .create_conv2d import create_conv2d -from .helpers import make_divisible -from .mlp import ConvMlp - - -class GatherExcite(nn.Module): - """ Gather-Excite Attention Module - """ - def __init__( - self, channels, feat_size=None, extra_params=False, extent=0, use_mlp=True, - rd_ratio=1./16, rd_channels=None, rd_divisor=1, add_maxpool=False, - act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, gate_layer='sigmoid'): - super(GatherExcite, self).__init__() - self.add_maxpool = add_maxpool - act_layer = get_act_layer(act_layer) - self.extent = extent - if extra_params: - self.gather = nn.Sequential() - if extent == 0: - assert feat_size is not None, 'spatial feature size must be specified for global extent w/ params' - self.gather.add_module( - 'conv1', create_conv2d(channels, channels, kernel_size=feat_size, stride=1, depthwise=True)) - if norm_layer: - self.gather.add_module('norm1', nn.BatchNorm2d(channels)) - else: - assert extent % 2 == 0 - num_conv = int(math.log2(extent)) - for i in range(num_conv): - self.gather.add_module( - f'conv{i + 1}', - create_conv2d(channels, channels, kernel_size=3, stride=2, depthwise=True)) - if norm_layer: - self.gather.add_module(f'norm{i + 1}', nn.BatchNorm2d(channels)) - if i != num_conv - 1: - self.gather.add_module(f'act{i + 1}', act_layer(inplace=True)) - else: - self.gather = None - if self.extent == 0: - self.gk = 0 - self.gs = 0 - else: - assert extent % 2 == 0 - self.gk = self.extent * 2 - 1 - self.gs = self.extent - - if not rd_channels: - rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.) 
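# NOTE: rd_channels follows the usual SE-style bottleneck sizing; with the defaults
# rd_ratio=1/16 and rd_divisor=1, a 256-channel input gives make_divisible(256 / 16) = 16
# hidden channels for the ConvMlp below. With extent=0 and extra_params=False the gather
# step is a plain global mean pool, which is why the header calls that configuration
# equivalent to Squeeze-and-Excitation.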
- self.mlp = ConvMlp(channels, rd_channels, act_layer=act_layer) if use_mlp else nn.Identity() - self.gate = create_act_layer(gate_layer) - - def forward(self, x): - size = x.shape[-2:] - if self.gather is not None: - x_ge = self.gather(x) - else: - if self.extent == 0: - # global extent - x_ge = x.mean(dim=(2, 3), keepdims=True) - if self.add_maxpool: - # experimental codepath, may remove or change - x_ge = 0.5 * x_ge + 0.5 * x.amax((2, 3), keepdim=True) - else: - x_ge = F.avg_pool2d( - x, kernel_size=self.gk, stride=self.gs, padding=self.gk // 2, count_include_pad=False) - if self.add_maxpool: - # experimental codepath, may remove or change - x_ge = 0.5 * x_ge + 0.5 * F.max_pool2d(x, kernel_size=self.gk, stride=self.gs, padding=self.gk // 2) - x_ge = self.mlp(x_ge) - if x_ge.shape[-1] != 1 or x_ge.shape[-2] != 1: - x_ge = F.interpolate(x_ge, size=size) - return x * self.gate(x_ge) diff --git a/mindnlp/mimm/layers/global_context.py b/mindnlp/mimm/layers/global_context.py deleted file mode 100644 index cb6a3e790..000000000 --- a/mindnlp/mimm/layers/global_context.py +++ /dev/null @@ -1,67 +0,0 @@ -""" Global Context Attention Block - -Paper: `GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond` - - https://arxiv.org/abs/1904.11492 - -Official code consulted as reference: https://github.com/xvjiarui/GCNet - -Hacked together by / Copyright 2021 Ross Wightman -""" -from mindnlp.core import nn -from mindnlp.core.nn import functional as F - -from .create_act import create_act_layer, get_act_layer -from .helpers import make_divisible -from .mlp import ConvMlp -from .norm import LayerNorm2d - - -class GlobalContext(nn.Module): - - def __init__(self, channels, use_attn=True, fuse_add=False, fuse_scale=True, init_last_zero=False, - rd_ratio=1./8, rd_channels=None, rd_divisor=1, act_layer=nn.ReLU, gate_layer='sigmoid'): - super(GlobalContext, self).__init__() - act_layer = get_act_layer(act_layer) - - self.conv_attn = nn.Conv2d(channels, 1, kernel_size=1, bias=True) if use_attn else None - - if rd_channels is None: - rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.) 
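# NOTE: mlp_scale re-weights the input multiplicatively through the gate, while mlp_add
# contributes a plain residual add of the transformed context; the defaults
# (fuse_scale=True, fuse_add=False) use the multiplicative path only.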
- if fuse_add: - self.mlp_add = ConvMlp(channels, rd_channels, act_layer=act_layer, norm_layer=LayerNorm2d) - else: - self.mlp_add = None - if fuse_scale: - self.mlp_scale = ConvMlp(channels, rd_channels, act_layer=act_layer, norm_layer=LayerNorm2d) - else: - self.mlp_scale = None - - self.gate = create_act_layer(gate_layer) - self.init_last_zero = init_last_zero - self.reset_parameters() - - def reset_parameters(self): - if self.conv_attn is not None: - nn.init.kaiming_normal_(self.conv_attn.weight, mode='fan_in', nonlinearity='relu') - if self.mlp_add is not None: - nn.init.zeros_(self.mlp_add.fc2.weight) - - def forward(self, x): - B, C, H, W = x.shape - - if self.conv_attn is not None: - attn = self.conv_attn(x).reshape(B, 1, H * W) # (B, 1, H * W) - attn = F.softmax(attn, dim=-1).unsqueeze(3) # (B, 1, H * W, 1) - context = x.reshape(B, C, H * W).unsqueeze(1) @ attn - context = context.view(B, C, 1, 1) - else: - context = x.mean(dim=(2, 3), keepdim=True) - - if self.mlp_scale is not None: - mlp_x = self.mlp_scale(context) - x = x * self.gate(mlp_x) - if self.mlp_add is not None: - mlp_x = self.mlp_add(context) - x = x + mlp_x - - return x diff --git a/mindnlp/mimm/layers/grid.py b/mindnlp/mimm/layers/grid.py deleted file mode 100644 index eb989a5d5..000000000 --- a/mindnlp/mimm/layers/grid.py +++ /dev/null @@ -1,49 +0,0 @@ -"""grid""" -from typing import Tuple - -import mindspore -from mindnlp.core import ops - -def ndgrid(*tensors) -> Tuple[mindspore.Tensor, ...]: - """generate N-D grid in dimension order. - - The ndgrid function is like meshgrid except that the order of the first two input arguments are switched. - - That is, the statement - [X1,X2,X3] = ndgrid(x1,x2,x3) - - produces the same result as - - [X2,X1,X3] = meshgrid(x2,x1,x3) - - This naming is based on MATLAB, the purpose is to avoid confusion due to torch's change to make - ops.meshgrid behaviour move from matching ndgrid ('ij') indexing to numpy meshgrid defaults of ('xy'). - - """ - try: - return ops.meshgrid(*tensors, indexing='ij') - except TypeError: - # old PyTorch < 1.10 will follow this path as it does not have indexing arg, - # the old behaviour of meshgrid was 'ij' - return ops.meshgrid(*tensors) - - -def meshgrid(*tensors) -> Tuple[mindspore.Tensor, ...]: - """generate N-D grid in spatial dim order. - - The meshgrid function is similar to ndgrid except that the order of the - first two input and output arguments is switched. - - That is, the statement - - [X,Y,Z] = meshgrid(x,y,z) - produces the same result as - - [Y,X,Z] = ndgrid(y,x,z) - Because of this, meshgrid is better suited to problems in two- or three-dimensional Cartesian space, - while ndgrid is better suited to multidimensional problems that aren't spatially based. - """ - - # NOTE: this will throw in PyTorch < 1.10 as meshgrid did not support indexing arg or have - # capability of generating grid in xy order before then. 
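# For example, with x = ops.arange(2) and y = ops.arange(3):
#   ndgrid(x, y)   -> two (2, 3) tensors ('ij' indexing, x varies along dim 0)
#   meshgrid(x, y) -> two (3, 2) tensors ('xy' indexing, x varies along dim 1)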
- return ops.meshgrid(*tensors, indexing='xy') diff --git a/mindnlp/mimm/layers/grn.py b/mindnlp/mimm/layers/grn.py deleted file mode 100644 index e45713e5f..000000000 --- a/mindnlp/mimm/layers/grn.py +++ /dev/null @@ -1,37 +0,0 @@ -""" Global Response Normalization Module - -Based on the GRN layer presented in -`ConvNeXt-V2 - Co-designing and Scaling ConvNets with Masked Autoencoders` - https://arxiv.org/abs/2301.00808 - -This implementation -* works for both NCHW and NHWC tensor layouts -* uses affine param names matching existing torch norm layers -* slightly improves eager mode performance via fused addcmul - -Hacked together by / Copyright 2023 Ross Wightman -""" - -from mindnlp.core import nn, ops - -class GlobalResponseNorm(nn.Module): - """ Global Response Normalization layer - """ - def __init__(self, dim, eps=1e-6, channels_last=True): - super().__init__() - self.eps = eps - if channels_last: - self.spatial_dim = (1, 2) - self.channel_dim = -1 - self.wb_shape = (1, 1, 1, -1) - else: - self.spatial_dim = (2, 3) - self.channel_dim = 1 - self.wb_shape = (1, -1, 1, 1) - - self.weight = nn.Parameter(ops.zeros(dim)) - self.bias = nn.Parameter(ops.zeros(dim)) - - def forward(self, x): - x_g = x.norm(p=2, dim=self.spatial_dim, keepdim=True) - x_n = x_g / (x_g.mean(dim=self.channel_dim, keepdim=True) + self.eps) - return x + ops.addcmul(self.bias.view(self.wb_shape), self.weight.view(self.wb_shape), x * x_n) diff --git a/mindnlp/mimm/layers/halo_attn.py b/mindnlp/mimm/layers/halo_attn.py deleted file mode 100644 index 81b93383e..000000000 --- a/mindnlp/mimm/layers/halo_attn.py +++ /dev/null @@ -1,226 +0,0 @@ -""" Halo Self Attention - -Paper: `Scaling Local Self-Attention for Parameter Efficient Visual Backbones` - - https://arxiv.org/abs/2103.12731 - -@misc{2103.12731, -Author = {Ashish Vaswani and Prajit Ramachandran and Aravind Srinivas and Niki Parmar and Blake Hechtman and - Jonathon Shlens}, -Title = {Scaling Local Self-Attention for Parameter Efficient Visual Backbones}, -Year = {2021}, -} - -Status: -This impl is a WIP, there is no official ref impl and some details in paper weren't clear to me. -The attention mechanism works but it's slow as implemented. 
- -Hacked together by / Copyright 2021 Ross Wightman -""" -from typing import List -from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F - -from .helpers import make_divisible -from .weight_init import trunc_normal_ - - -def rel_logits_1d(q, rel_k, permute_mask: List[int]): - """ Compute relative logits along one dimension - - As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2 - Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925 - - Args: - q: (batch, height, width, dim) - rel_k: (2 * window - 1, dim) - permute_mask: permute output dim according to this - """ - B, H, W, dim = q.shape - rel_size = rel_k.shape[0] - win_size = (rel_size + 1) // 2 - - x = (q @ rel_k.transpose(-1, -2)) - x = x.reshape(-1, W, rel_size) - - # pad to shift from relative to absolute indexing - x_pad = F.pad(x, [0, 1]).flatten(1) - x_pad = F.pad(x_pad, [0, rel_size - W]) - - # reshape and slice out the padded elements - x_pad = x_pad.reshape(-1, W + 1, rel_size) - x = x_pad[:, :W, win_size - 1:] - - # reshape and tile - x = x.reshape(B, H, 1, W, win_size).expand(-1, -1, win_size, -1, -1) - return x.permute(permute_mask) - - -class PosEmbedRel(nn.Module): - """ Relative Position Embedding - As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2 - Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925 - - """ - def __init__(self, block_size, win_size, dim_head, scale): - """ - Args: - block_size (int): block size - win_size (int): neighbourhood window size - dim_head (int): attention head dim - scale (float): scale factor (for init) - """ - super().__init__() - self.block_size = block_size - self.dim_head = dim_head - self.height_rel = nn.Parameter(ops.randn(win_size * 2 - 1, dim_head) * scale) - self.width_rel = nn.Parameter(ops.randn(win_size * 2 - 1, dim_head) * scale) - - def forward(self, q): - B, BB, HW, _ = q.shape - - # relative logits in width dimension. - q = q.reshape(-1, self.block_size, self.block_size, self.dim_head) - rel_logits_w = rel_logits_1d(q, self.width_rel, permute_mask=(0, 1, 3, 2, 4)) - - # relative logits in height dimension. - q = q.transpose(1, 2) - rel_logits_h = rel_logits_1d(q, self.height_rel, permute_mask=(0, 3, 1, 4, 2)) - - rel_logits = rel_logits_h + rel_logits_w - rel_logits = rel_logits.reshape(B, BB, HW, -1) - return rel_logits - - -class HaloAttn(nn.Module): - """ Halo Attention - - Paper: `Scaling Local Self-Attention for Parameter Efficient Visual Backbones` - - https://arxiv.org/abs/2103.12731 - - The internal dimensions of the attention module are controlled by the interaction of several arguments. - * the output dimension of the module is specified by dim_out, which falls back to input dim if not set - * the value (v) dimension is set to dim_out // num_heads, the v projection determines the output dim - * the query and key (qk) dimensions are determined by - * num_heads * dim_head if dim_head is not None - * num_heads * (dim_out * attn_ratio // num_heads) if dim_head is None - * as seen above, attn_ratio determines the ratio of q and k relative to the output if dim_head not used - - Args: - dim (int): input dimension to the module - dim_out (int): output dimension of the module, same as dim if not set - feat_size (Tuple[int, int]): size of input feature_map (not used, for arg compat with bottle/lambda) - stride: output stride of the module, query downscaled if > 1 (default: 1). 
- num_heads: parallel attention heads (default: 8). - dim_head: dimension of query and key heads, calculated from dim_out * attn_ratio // num_heads if not set - block_size (int): size of blocks. (default: 8) - halo_size (int): size of halo overlap. (default: 3) - qk_ratio (float): ratio of q and k dimensions to output dimension when dim_head not set. (default: 1.0) - qkv_bias (bool) : add bias to q, k, and v projections - avg_down (bool): use average pool downsample instead of strided query blocks - scale_pos_embed (bool): scale the position embedding as well as Q @ K - """ - def __init__( - self, dim, dim_out=None, feat_size=None, stride=1, num_heads=8, dim_head=None, block_size=8, halo_size=3, - qk_ratio=1.0, qkv_bias=False, avg_down=False, scale_pos_embed=False): - super().__init__() - dim_out = dim_out or dim - assert dim_out % num_heads == 0 - assert stride in (1, 2) - self.num_heads = num_heads - self.dim_head_qk = dim_head or make_divisible(dim_out * qk_ratio, divisor=8) // num_heads - self.dim_head_v = dim_out // self.num_heads - self.dim_out_qk = num_heads * self.dim_head_qk - self.dim_out_v = num_heads * self.dim_head_v - self.scale = self.dim_head_qk ** -0.5 - self.scale_pos_embed = scale_pos_embed - self.block_size = self.block_size_ds = block_size - self.halo_size = halo_size - self.win_size = block_size + halo_size * 2 # neighbourhood window size - self.block_stride = 1 - use_avg_pool = False - if stride > 1: - use_avg_pool = avg_down or block_size % stride != 0 - self.block_stride = 1 if use_avg_pool else stride - self.block_size_ds = self.block_size // self.block_stride - - self.q = nn.Conv2d(dim, self.dim_out_qk, 1, stride=self.block_stride, bias=qkv_bias) - self.kv = nn.Conv2d(dim, self.dim_out_qk + self.dim_out_v, 1, bias=qkv_bias) - - self.pos_embed = PosEmbedRel( - block_size=self.block_size_ds, win_size=self.win_size, dim_head=self.dim_head_qk, scale=self.scale) - - self.pool = nn.AvgPool2d(2, 2) if use_avg_pool else nn.Identity() - - self.reset_parameters() - - def reset_parameters(self): - std = self.q.weight.shape[1] ** -0.5 # fan-in - trunc_normal_(self.q.weight, std=std) - trunc_normal_(self.kv.weight, std=std) - trunc_normal_(self.pos_embed.height_rel, std=self.scale) - trunc_normal_(self.pos_embed.width_rel, std=self.scale) - - def forward(self, x): - B, C, H, W = x.shape - assert H % self.block_size == 0, '' - assert W % self.block_size == 0, '' - num_h_blocks = H // self.block_size - num_w_blocks = W // self.block_size - num_blocks = num_h_blocks * num_w_blocks - - q = self.q(x) - # unfold - q = q.reshape( - -1, self.dim_head_qk, - num_h_blocks, self.block_size_ds, num_w_blocks, self.block_size_ds).permute(0, 1, 3, 5, 2, 4) - # B, num_heads * dim_head * block_size ** 2, num_blocks - q = q.reshape(B * self.num_heads, self.dim_head_qk, -1, num_blocks).transpose(1, 3) - # B * num_heads, num_blocks, block_size ** 2, dim_head - - kv = self.kv(x) - # Generate overlapping windows for kv. This approach is good for GPU and CPU. However, unfold() is not - # lowered for PyTorch XLA so it will be very slow. See code at bottom of file for XLA friendly approach. 
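# Shape note: every block_size x block_size query block attends to a win_size x win_size
# key/value window, where win_size = block_size + 2 * halo_size (14 with the defaults 8
# and 3), so attn below ends up (B * num_heads, num_blocks, block_size ** 2, win_size ** 2).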
- kv = F.pad(kv, [self.halo_size, self.halo_size, self.halo_size, self.halo_size]) - kv = kv.unfold(2, self.win_size, self.block_size).unfold(3, self.win_size, self.block_size).reshape( - B * self.num_heads, self.dim_head_qk + self.dim_head_v, num_blocks, -1).permute(0, 2, 3, 1) - k, v = ops.split(kv, [self.dim_head_qk, self.dim_head_v], dim=-1) - # B * num_heads, num_blocks, win_size ** 2, dim_head_qk or dim_head_v - - if self.scale_pos_embed: - attn = (q @ k.transpose(-1, -2) + self.pos_embed(q)) * self.scale - else: - attn = (q @ k.transpose(-1, -2)) * self.scale + self.pos_embed(q) - # B * num_heads, num_blocks, block_size ** 2, win_size ** 2 - attn = attn.softmax(dim=-1) - - out = (attn @ v).transpose(1, 3) # B * num_heads, dim_head_v, block_size ** 2, num_blocks - # fold - out = out.reshape(-1, self.block_size_ds, self.block_size_ds, num_h_blocks, num_w_blocks) - out = out.permute(0, 3, 1, 4, 2).contiguous().view( - B, self.dim_out_v, H // self.block_stride, W // self.block_stride) - # B, dim_out, H // block_stride, W // block_stride - out = self.pool(out) - return out - - -""" Three alternatives for overlapping windows. - -`.unfold().unfold()` is same speed as stride tricks with similar clarity as F.unfold() - - if is_xla: - # This code achieves haloing on PyTorch XLA with reasonable runtime trade-off, it is - # EXTREMELY slow for backward on a GPU though so I need a way of selecting based on environment. - WW = self.win_size ** 2 - pw = torch.eye(WW, dtype=x.dtype, device=x.device).reshape(WW, 1, self.win_size, self.win_size) - kv = F.conv2d(kv.reshape(-1, 1, H, W), pw, stride=self.block_size, padding=self.halo_size) - elif self.stride_tricks: - kv = F.pad(kv, [self.halo_size, self.halo_size, self.halo_size, self.halo_size]).contiguous() - kv = kv.as_strided(( - B, self.dim_out_qk + self.dim_out_v, self.win_size, self.win_size, num_h_blocks, num_w_blocks), - stride=(kv.stride(0), kv.stride(1), kv.shape[-1], 1, self.block_size * kv.shape[-1], self.block_size)) - else: - kv = F.unfold(kv, kernel_size=self.win_size, stride=self.block_size, padding=self.halo_size) - - kv = kv.reshape( - B * self.num_heads, self.dim_head_qk + self.dim_head_v, -1, num_blocks).transpose(1, 3) -""" \ No newline at end of file diff --git a/mindnlp/mimm/layers/helpers.py b/mindnlp/mimm/layers/helpers.py deleted file mode 100644 index b003f48d8..000000000 --- a/mindnlp/mimm/layers/helpers.py +++ /dev/null @@ -1,43 +0,0 @@ -""" Layer/Module Helpers - -Hacked together by / Copyright 2020 Ross Wightman -""" -from itertools import repeat -import collections.abc - - -# From PyTorch internals -def _ntuple(n): - def parse(x): - if isinstance(x, collections.abc.Iterable) and not isinstance(x, str): - return tuple(x) - return tuple(repeat(x, n)) - return parse - - -to_1tuple = _ntuple(1) -to_2tuple = _ntuple(2) -to_3tuple = _ntuple(3) -to_4tuple = _ntuple(4) -to_ntuple = _ntuple - - -def make_divisible(v, divisor=8, min_value=None, round_limit=.9): - min_value = min_value or divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. 
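# e.g. make_divisible(30, 8) -> int(30 + 4) // 8 * 8 = 32 (rounds up, guard not needed),
# while make_divisible(27, 8) -> 24 and 24 < 0.9 * 27, so the check below bumps it to 32.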
- if new_v < round_limit * v: - new_v += divisor - return new_v - - -def extend_tuple(x, n): - # pads a tuple to specified n by padding with last value - if not isinstance(x, (tuple, list)): - x = (x,) - else: - x = tuple(x) - pad_n = n - len(x) - if pad_n <= 0: - return x[:n] - return x + (x[-1],) * pad_n diff --git a/mindnlp/mimm/layers/hybrid_embed.py b/mindnlp/mimm/layers/hybrid_embed.py deleted file mode 100644 index 69a575ff5..000000000 --- a/mindnlp/mimm/layers/hybrid_embed.py +++ /dev/null @@ -1,251 +0,0 @@ -""" Image to Patch Hybird Embedding Layer - -Hacked together by / Copyright 2020 Ross Wightman -""" -import logging -import math -from typing import List, Optional, Tuple, Union - -import mindspore -from mindnlp.core import nn, ops, no_grad -from mindnlp.core.nn import functional as F - -from .format import Format, nchw_to -from .helpers import to_2tuple -from .patch_embed import resample_patch_embed - - -_logger = logging.getLogger(__name__) - - -class HybridEmbed(nn.Module): - """ CNN Feature Map Embedding - Extract feature map from CNN, flatten, project to embedding dim. - """ - output_fmt: Format - dynamic_img_pad: bool - - def __init__( - self, - backbone: nn.Module, - img_size: Union[int, Tuple[int, int]] = 224, - patch_size: Union[int, Tuple[int, int]] = 1, - feature_size: Optional[Union[int, Tuple[int, int]]] = None, - feature_ratio: Optional[Union[int, Tuple[int, int]]] = None, - in_chans: int = 3, - embed_dim: int = 768, - bias: bool = True, - proj: bool = True, - flatten: bool = True, - output_fmt: Optional[str] = None, - strict_img_size: bool = True, - dynamic_img_pad: bool = False, - ): - super().__init__() - assert isinstance(backbone, nn.Module) - self.backbone = backbone - self.in_chans = in_chans - ( - self.img_size, - self.patch_size, - self.feature_size, - self.feature_ratio, - self.feature_dim, - self.grid_size, - self.num_patches, - ) = self._init_backbone( - img_size=img_size, - patch_size=patch_size, - feature_size=feature_size, - feature_ratio=feature_ratio, - ) - - if output_fmt is not None: - self.flatten = False - self.output_fmt = Format(output_fmt) - else: - # flatten spatial dim and transpose to channels last, kept for bwd compat - self.flatten = flatten - self.output_fmt = Format.NCHW - self.strict_img_size = strict_img_size - self.dynamic_img_pad = dynamic_img_pad - if not dynamic_img_pad: - assert self.feature_size[0] % self.patch_size[0] == 0 and self.feature_size[1] % self.patch_size[1] == 0 - - if proj: - self.proj = nn.Conv2d( - self.feature_dim, - embed_dim, - kernel_size=patch_size, - stride=patch_size, - bias=bias, - ) - else: - assert self.feature_dim == embed_dim, \ - f'The feature dim ({self.feature_dim} must match embed dim ({embed_dim}) when projection disabled.' 
- self.proj = nn.Identity() - - def _init_backbone( - self, - img_size: Union[int, Tuple[int, int]] = 224, - patch_size: Union[int, Tuple[int, int]] = 1, - feature_size: Optional[Union[int, Tuple[int, int]]] = None, - feature_ratio: Optional[Union[int, Tuple[int, int]]] = None, - feature_dim: Optional[int] = None, - ): - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - if feature_size is None: - with no_grad(): - # NOTE Most reliable way of determining output dims is to run forward pass - training = self.backbone.training - if training: - self.backbone.eval() - o = self.backbone(ops.zeros(1, self.in_chans, img_size[0], img_size[1])) - if isinstance(o, (list, tuple)): - o = o[-1] # last feature if backbone outputs list/tuple of features - feature_size = o.shape[-2:] - feature_dim = o.shape[1] - self.backbone.train(training) - feature_ratio = tuple([s // f for s, f in zip(img_size, feature_size)]) - else: - feature_size = to_2tuple(feature_size) - feature_ratio = to_2tuple(feature_ratio or 16) - if feature_dim is None: - if hasattr(self.backbone, 'feature_info'): - feature_dim = self.backbone.feature_info.channels()[-1] - else: - feature_dim = self.backbone.num_features - grid_size = tuple([f // p for f, p in zip(feature_size, patch_size)]) - num_patches = grid_size[0] * grid_size[1] - return img_size, patch_size, feature_size, feature_ratio, feature_dim, grid_size, num_patches - - def set_input_size( - self, - img_size: Optional[Union[int, Tuple[int, int]]] = None, - patch_size: Optional[Union[int, Tuple[int, int]]] = None, - feature_size: Optional[Union[int, Tuple[int, int]]] = None, - feature_ratio: Optional[Union[int, Tuple[int, int]]] = None, - feature_dim: Optional[int] = None, - ): - assert img_size is not None or patch_size is not None - img_size = img_size or self.img_size - new_patch_size = None - if patch_size is not None: - new_patch_size = to_2tuple(patch_size) - if new_patch_size is not None and new_patch_size != self.patch_size: - assert isinstance(self.proj, nn.Conv2d), 'HybridEmbed must have a projection layer to change patch size.' 
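# When the patch size changes, the existing projection kernel is resampled to the new
# kernel size via resample_patch_embed and the bias is copied over, so pretrained
# projection weights are reused rather than re-initialized.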
- with no_grad(): - new_proj = nn.Conv2d( - self.proj.in_channels, - self.proj.out_channels, - kernel_size=new_patch_size, - stride=new_patch_size, - bias=self.proj.bias is not None, - ) - new_proj.weight.copy_(resample_patch_embed(self.proj.weight, new_patch_size, verbose=True)) - if self.proj.bias is not None: - new_proj.bias.copy_(self.proj.bias) - self.proj = new_proj - patch_size = new_patch_size - patch_size = patch_size or self.patch_size - - if img_size != self.img_size or patch_size != self.patch_size: - ( - self.img_size, - self.patch_size, - self.feature_size, - self.feature_ratio, - self.feature_dim, - self.grid_size, - self.num_patches, - ) = self._init_backbone( - img_size=img_size, - patch_size=patch_size, - feature_size=feature_size, - feature_ratio=feature_ratio, - feature_dim=feature_dim, - ) - - def feat_ratio(self, as_scalar=True) -> Union[Tuple[int, int], int]: - total_reduction = ( - self.feature_ratio[0] * self.patch_size[0], - self.feature_ratio[1] * self.patch_size[1] - ) - if as_scalar: - return max(total_reduction) - else: - return total_reduction - - def dynamic_feat_size(self, img_size: Tuple[int, int]) -> Tuple[int, int]: - """ Get feature grid size taking account dynamic padding and backbone network feat reduction - """ - feat_size = (img_size[0] // self.feature_ratio[0], img_size[1] // self.feature_ratio[1]) - if self.dynamic_img_pad: - return math.ceil(feat_size[0] / self.patch_size[0]), math.ceil(feat_size[1] / self.patch_size[1]) - else: - return feat_size[0] // self.patch_size[0], feat_size[1] // self.patch_size[1] - - def set_grad_checkpointing(self, enable: bool = True): - if hasattr(self.backbone, 'set_grad_checkpointing'): - self.backbone.set_grad_checkpointing(enable=enable) - elif hasattr(self.backbone, 'grad_checkpointing'): - self.backbone.grad_checkpointing = enable - - def forward(self, x): - x = self.backbone(x) - if isinstance(x, (list, tuple)): - x = x[-1] # last feature if backbone outputs list/tuple of features - _, _, H, W = x.shape - if self.dynamic_img_pad: - pad_h = (self.patch_size[0] - H % self.patch_size[0]) % self.patch_size[0] - pad_w = (self.patch_size[1] - W % self.patch_size[1]) % self.patch_size[1] - x = F.pad(x, (0, pad_w, 0, pad_h)) - x = self.proj(x) - if self.flatten: - x = x.flatten(2).transpose(1, 2) # NCHW -> NLC - elif self.output_fmt != Format.NCHW: - x = nchw_to(x, self.output_fmt) - return x - - -class HybridEmbedWithSize(HybridEmbed): - """ CNN Feature Map Embedding - Extract feature map from CNN, flatten, project to embedding dim. 
- """ - def __init__( - self, - backbone: nn.Module, - img_size: Union[int, Tuple[int, int]] = 224, - patch_size: Union[int, Tuple[int, int]] = 1, - feature_size: Optional[Union[int, Tuple[int, int]]] = None, - feature_ratio: Optional[Union[int, Tuple[int, int]]] = None, - in_chans: int = 3, - embed_dim: int = 768, - bias=True, - proj=True, - ): - super().__init__( - backbone=backbone, - img_size=img_size, - patch_size=patch_size, - feature_size=feature_size, - feature_ratio=feature_ratio, - in_chans=in_chans, - embed_dim=embed_dim, - bias=bias, - proj=proj, - ) - - def set_grad_checkpointing(self, enable: bool = True): - if hasattr(self.backbone, 'set_grad_checkpointing'): - self.backbone.set_grad_checkpointing(enable=enable) - elif hasattr(self.backbone, 'grad_checkpointing'): - self.backbone.grad_checkpointing = enable - - def forward(self, x) -> Tuple[mindspore.Tensor, List[int]]: - x = self.backbone(x) - if isinstance(x, (list, tuple)): - x = x[-1] # last feature if backbone outputs list/tuple of features - x = self.proj(x) - return x.flatten(2).transpose(1, 2), x.shape[-2:] diff --git a/mindnlp/mimm/layers/inplace_abn.py b/mindnlp/mimm/layers/inplace_abn.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mindnlp/mimm/layers/interpolate.py b/mindnlp/mimm/layers/interpolate.py deleted file mode 100644 index a4b024d93..000000000 --- a/mindnlp/mimm/layers/interpolate.py +++ /dev/null @@ -1,69 +0,0 @@ -""" Interpolation helpers for timm layers - -RegularGridInterpolator from https://github.com/sbarratt/torch_interpolations -Copyright Shane Barratt, Apache 2.0 license -""" -from itertools import product -import mindspore -from mindnlp.core import ops - - -class RegularGridInterpolator: - """ Interpolate data defined on a rectilinear grid with even or uneven spacing. - Produces similar results to scipy RegularGridInterpolator or interp2d - in 'linear' mode. - - Taken from https://github.com/sbarratt/torch_interpolations - """ - - def __init__(self, points, values): - self.points = points - self.values = values - - assert isinstance(self.points, (list, tuple)) - assert isinstance(self.values, mindspore.Tensor) - - self.ms = list(self.values.shape) - self.n = len(self.points) - - assert len(self.ms) == self.n - - for i, p in enumerate(self.points): - assert isinstance(p, mindspore.Tensor) - assert p.shape[0] == self.values.shape[i] - - def __call__(self, points_to_interp): - assert self.points is not None - assert self.values is not None - - assert len(points_to_interp) == len(self.points) - K = points_to_interp[0].shape[0] - for x in points_to_interp: - assert x.shape[0] == K - - idxs = [] - dists = [] - overalls = [] - for p, x in zip(self.points, points_to_interp): - idx_right = ops.bucketize(x, p) - idx_right[idx_right >= p.shape[0]] = p.shape[0] - 1 - idx_left = (idx_right - 1).clamp(0, p.shape[0] - 1) - dist_left = x - p[idx_left] - dist_right = p[idx_right] - x - dist_left[dist_left < 0] = 0. - dist_right[dist_right < 0] = 0. - both_zero = (dist_left == 0) & (dist_right == 0) - dist_left[both_zero] = dist_right[both_zero] = 1. - - idxs.append((idx_left, idx_right)) - dists.append((dist_left, dist_right)) - overalls.append(dist_left + dist_right) - - numerator = 0. 
- for indexer in product([0, 1], repeat=self.n): - as_s = [idx[onoff] for onoff, idx in zip(indexer, idxs)] - bs_s = [dist[1 - onoff] for onoff, dist in zip(indexer, dists)] - numerator += self.values[as_s] * \ - ops.prod(ops.stack(bs_s), dim=0) - denominator = ops.prod(ops.stack(overalls), dim=0) - return numerator / denominator diff --git a/mindnlp/mimm/layers/lambda_layer.py b/mindnlp/mimm/layers/lambda_layer.py deleted file mode 100644 index 2fb6ac161..000000000 --- a/mindnlp/mimm/layers/lambda_layer.py +++ /dev/null @@ -1,129 +0,0 @@ -""" Lambda Layer - -Paper: `LambdaNetworks: Modeling Long-Range Interactions Without Attention` - - https://arxiv.org/abs/2102.08602 - -@misc{2102.08602, -Author = {Irwan Bello}, -Title = {LambdaNetworks: Modeling Long-Range Interactions Without Attention}, -Year = {2021}, -} - -Status: -This impl is a WIP. Code snippets in the paper were used as reference but -good chance some details are missing/wrong. - -I've only implemented local lambda conv based pos embeddings. - -""" -# pylint: disable=unsubscriptable-object -from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F - -from .grid import ndgrid -from .helpers import to_2tuple, make_divisible -from .weight_init import trunc_normal_ - - -def rel_pos_indices(size): - size = to_2tuple(size) - pos = ops.stack(ndgrid(ops.arange(size[0]), ops.arange(size[1]))).flatten(1) - rel_pos = pos[:, None, :] - pos[:, :, None] - rel_pos[0] += size[0] - 1 - rel_pos[1] += size[1] - 1 - return rel_pos # 2, H * W, H * W - - -class LambdaLayer(nn.Module): - """Lambda Layer - - Paper: `LambdaNetworks: Modeling Long-Range Interactions Without Attention` - - https://arxiv.org/abs/2102.08602 - - NOTE: intra-depth parameter 'u' is fixed at 1. It did not appear worth the complexity to add. - - The internal dimensions of the lambda module are controlled via the interaction of several arguments. - * the output dimension of the module is specified by dim_out, which falls back to input dim if not set - * the value (v) dimension is set to dim_out // num_heads, the v projection determines the output dim - * the query (q) and key (k) dimension are determined by - * dim_head = (dim_out * attn_ratio // num_heads) if dim_head is None - * q = num_heads * dim_head, k = dim_head - * as seen above, attn_ratio determines the ratio of q and k relative to the output if dim_head not set - - Args: - dim (int): input dimension to the module - dim_out (int): output dimension of the module, same as dim if not set - feat_size (Tuple[int, int]): size of input feature_map for relative pos variant H, W - stride (int): output stride of the module, avg pool used if stride == 2 - num_heads (int): parallel attention heads. - dim_head (int): dimension of query and key heads, calculated from dim_out * attn_ratio // num_heads if not set - r (int): local lambda convolution radius. Use lambda conv if set, else relative pos if not. (default: 9) - qk_ratio (float): ratio of q and k dimensions to output dimension when dim_head not set. 
(default: 1.0) - qkv_bias (bool): add bias to q, k, and v projections - """ - def __init__( - self, dim, dim_out=None, feat_size=None, stride=1, num_heads=4, dim_head=16, r=9, - qk_ratio=1.0, qkv_bias=False): - super().__init__() - dim_out = dim_out or dim - assert dim_out % num_heads == 0, ' should be divided by num_heads' - self.dim_qk = dim_head or make_divisible(dim_out * qk_ratio, divisor=8) // num_heads - self.num_heads = num_heads - self.dim_v = dim_out // num_heads - - self.qkv = nn.Conv2d( - dim, - num_heads * self.dim_qk + self.dim_qk + self.dim_v, - kernel_size=1, bias=qkv_bias) - self.norm_q = nn.BatchNorm2d(num_heads * self.dim_qk) - self.norm_v = nn.BatchNorm2d(self.dim_v) - - if r is not None: - # local lambda convolution for pos - self.conv_lambda = nn.Conv3d(1, self.dim_qk, (r, r, 1), padding=(r // 2, r // 2, 0)) - self.pos_emb = None - self.rel_pos_indices = None - else: - # relative pos embedding - assert feat_size is not None - feat_size = to_2tuple(feat_size) - rel_size = [2 * s - 1 for s in feat_size] - self.conv_lambda = None - self.pos_emb = nn.Parameter(ops.zeros(rel_size[0], rel_size[1], self.dim_qk)) - self.register_buffer('rel_pos_indices', rel_pos_indices(feat_size), persistent=False) - - self.pool = nn.AvgPool2d(2, 2) if stride == 2 else nn.Identity() - - self.reset_parameters() - - def reset_parameters(self): - trunc_normal_(self.qkv.weight, std=self.qkv.weight.shape[1] ** -0.5) # fan-in - if self.conv_lambda is not None: - trunc_normal_(self.conv_lambda.weight, std=self.dim_qk ** -0.5) - if self.pos_emb is not None: - trunc_normal_(self.pos_emb, std=.02) - - def forward(self, x): - B, C, H, W = x.shape - M = H * W - qkv = self.qkv(x) - q, k, v = ops.split(qkv, [ - self.num_heads * self.dim_qk, self.dim_qk, self.dim_v], dim=1) - q = self.norm_q(q).reshape(B, self.num_heads, self.dim_qk, M).transpose(-1, -2) # B, num_heads, M, K - v = self.norm_v(v).reshape(B, self.dim_v, M).transpose(-1, -2) # B, M, V - k = F.softmax(k.reshape(B, self.dim_qk, M), dim=-1) # B, K, M - - content_lam = k @ v # B, K, V - content_out = q @ content_lam.unsqueeze(1) # B, num_heads, M, V - - if self.pos_emb is None: - position_lam = self.conv_lambda(v.reshape(B, 1, H, W, self.dim_v)) # B, H, W, V, K - position_lam = position_lam.reshape(B, 1, self.dim_qk, H * W, self.dim_v).transpose(2, 3) # B, 1, M, K, V - else: - pos_emb = self.pos_emb[self.rel_pos_indices[0], self.rel_pos_indices[1]].expand(B, -1, -1, -1) - position_lam = (pos_emb.transpose(-1, -2) @ v.unsqueeze(1)).unsqueeze(1) # B, 1, M, K, V - position_out = (q.unsqueeze(-2) @ position_lam).squeeze(-2) # B, num_heads, M, V - - out = (content_out + position_out).transpose(-1, -2).reshape(B, C, H, W) # B, C (num_heads * V), H, W - out = self.pool(out) - return out diff --git a/mindnlp/mimm/layers/layer_scale.py b/mindnlp/mimm/layers/layer_scale.py deleted file mode 100644 index 8c41a6790..000000000 --- a/mindnlp/mimm/layers/layer_scale.py +++ /dev/null @@ -1,37 +0,0 @@ -"""layer scale""" -import mindspore -from mindnlp.core import nn, ops - -class LayerScale(nn.Module): - """ LayerScale on tensors with channels in last-dim. 
- """ - def __init__( - self, - dim: int, - init_values: float = 1e-5, - inplace: bool = False, - ) -> None: - super().__init__() - self.inplace = inplace - self.gamma = nn.Parameter(init_values * ops.ones(dim)) - - def forward(self, x: mindspore.Tensor) -> mindspore.Tensor: - return x.mul_(self.gamma) if self.inplace else x * self.gamma - - -class LayerScale2d(nn.Module): - """ LayerScale for tensors with torch 2D NCHW layout. - """ - def __init__( - self, - dim: int, - init_values: float = 1e-5, - inplace: bool = False, - ): - super().__init__() - self.inplace = inplace - self.gamma = nn.Parameter(init_values * ops.ones(dim)) - - def forward(self, x): - gamma = self.gamma.view(1, -1, 1, 1) - return x.mul_(gamma) if self.inplace else x * gamma diff --git a/mindnlp/mimm/layers/mixed_conv2d.py b/mindnlp/mimm/layers/mixed_conv2d.py deleted file mode 100644 index 28df13299..000000000 --- a/mindnlp/mimm/layers/mixed_conv2d.py +++ /dev/null @@ -1,50 +0,0 @@ -""" MindSpore Mixed Convolution - -Paper: MixConv: Mixed Depthwise Convolutional Kernels (https://arxiv.org/abs/1907.09595) - -Hacked together by / Copyright 2020 Ross Wightman -""" - -from mindnlp.core import nn, ops - -from .conv2d_same import create_conv2d_pad - - -def _split_channels(num_chan, num_groups): - split = [num_chan // num_groups for _ in range(num_groups)] - split[0] += num_chan - sum(split) - return split - - -class MixedConv2d(nn.ModuleDict): - """ Mixed Grouped Convolution - - Based on MDConv and GroupedConv in MixNet impl: - https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mixnet/custom_layers.py - """ - def __init__(self, in_channels, out_channels, kernel_size=3, - stride=1, padding='', dilation=1, depthwise=False, **kwargs): - super(MixedConv2d, self).__init__() - - kernel_size = kernel_size if isinstance(kernel_size, list) else [kernel_size] - num_groups = len(kernel_size) - in_splits = _split_channels(in_channels, num_groups) - out_splits = _split_channels(out_channels, num_groups) - self.in_channels = sum(in_splits) - self.out_channels = sum(out_splits) - for idx, (k, in_ch, out_ch) in enumerate(zip(kernel_size, in_splits, out_splits)): - conv_groups = in_ch if depthwise else 1 - # use add_module to keep key space clean - self.add_module( - str(idx), - create_conv2d_pad( - in_ch, out_ch, k, stride=stride, - padding=padding, dilation=dilation, groups=conv_groups, **kwargs) - ) - self.splits = in_splits - - def forward(self, x): - x_split = ops.split(x, self.splits, 1) - x_out = [c(x_split[i]) for i, c in enumerate(self.values())] - x = ops.cat(x_out, 1) - return x diff --git a/mindnlp/mimm/layers/mlp.py b/mindnlp/mimm/layers/mlp.py deleted file mode 100644 index 11f24acd9..000000000 --- a/mindnlp/mimm/layers/mlp.py +++ /dev/null @@ -1,260 +0,0 @@ -""" MLP module w/ dropout and configurable activation layer - -Hacked together by / Copyright 2020 Ross Wightman -""" -from functools import partial - -from mindnlp.core import nn - -from .grn import GlobalResponseNorm -from .helpers import to_2tuple - - -class Mlp(nn.Module): - """ MLP as used in Vision Transformer, MLP-Mixer and related networks - """ - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - norm_layer=None, - bias=True, - drop=0., - use_conv=False, - ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - bias = to_2tuple(bias) - drop_probs = to_2tuple(drop) - linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv 
else nn.Linear - - self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0]) - self.act = act_layer() - self.drop1 = nn.Dropout(drop_probs[0]) - self.norm = norm_layer(hidden_features) if norm_layer is not None else nn.Identity() - self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1]) - self.drop2 = nn.Dropout(drop_probs[1]) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop1(x) - x = self.norm(x) - x = self.fc2(x) - x = self.drop2(x) - return x - - -class GluMlp(nn.Module): - """ MLP w/ GLU style gating - See: https://arxiv.org/abs/1612.08083, https://arxiv.org/abs/2002.05202 - """ - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.Sigmoid, - norm_layer=None, - bias=True, - drop=0., - use_conv=False, - gate_last=True, - ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - assert hidden_features % 2 == 0 - bias = to_2tuple(bias) - drop_probs = to_2tuple(drop) - linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear - self.chunk_dim = 1 if use_conv else -1 - self.gate_last = gate_last # use second half of width for gate - - self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0]) - self.act = act_layer() - self.drop1 = nn.Dropout(drop_probs[0]) - self.norm = norm_layer(hidden_features // 2) if norm_layer is not None else nn.Identity() - self.fc2 = linear_layer(hidden_features // 2, out_features, bias=bias[1]) - self.drop2 = nn.Dropout(drop_probs[1]) - - def init_weights(self): - # override init of fc1 w/ gate portion set to weight near zero, bias=1 - fc1_mid = self.fc1.bias.shape[0] // 2 - nn.init.ones_(self.fc1.bias[fc1_mid:]) - nn.init.normal_(self.fc1.weight[fc1_mid:], std=1e-6) - - def forward(self, x): - x = self.fc1(x) - x1, x2 = x.chunk(2, dim=self.chunk_dim) - x = x1 * self.act(x2) if self.gate_last else self.act(x1) * x2 - x = self.drop1(x) - x = self.norm(x) - x = self.fc2(x) - x = self.drop2(x) - return x - - -SwiGLUPacked = partial(GluMlp, act_layer=nn.SiLU, gate_last=False) - - -class SwiGLU(nn.Module): - """ SwiGLU - NOTE: GluMLP above can implement SwiGLU, but this impl has split fc1 and - better matches some other common impl which makes mapping checkpoints simpler. 
- """ - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.SiLU, - norm_layer=None, - bias=True, - drop=0., - ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - bias = to_2tuple(bias) - drop_probs = to_2tuple(drop) - - self.fc1_g = nn.Linear(in_features, hidden_features, bias=bias[0]) - self.fc1_x = nn.Linear(in_features, hidden_features, bias=bias[0]) - self.act = act_layer() - self.drop1 = nn.Dropout(drop_probs[0]) - self.norm = norm_layer(hidden_features) if norm_layer is not None else nn.Identity() - self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1]) - self.drop2 = nn.Dropout(drop_probs[1]) - - def init_weights(self): - # override init of fc1 w/ gate portion set to weight near zero, bias=1 - nn.init.ones_(self.fc1_g.bias) - nn.init.normal_(self.fc1_g.weight, std=1e-6) - - def forward(self, x): - x_gate = self.fc1_g(x) - x = self.fc1_x(x) - x = self.act(x_gate) * x - x = self.drop1(x) - x = self.norm(x) - x = self.fc2(x) - x = self.drop2(x) - return x - - -class GatedMlp(nn.Module): - """ MLP as used in gMLP - """ - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - norm_layer=None, - gate_layer=None, - bias=True, - drop=0., - ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - bias = to_2tuple(bias) - drop_probs = to_2tuple(drop) - - self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0]) - self.act = act_layer() - self.drop1 = nn.Dropout(drop_probs[0]) - if gate_layer is not None: - assert hidden_features % 2 == 0 - self.gate = gate_layer(hidden_features) - hidden_features = hidden_features // 2 - else: - self.gate = nn.Identity() - self.norm = norm_layer(hidden_features) if norm_layer is not None else nn.Identity() - self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1]) - self.drop2 = nn.Dropout(drop_probs[1]) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop1(x) - x = self.gate(x) - x = self.norm(x) - x = self.fc2(x) - x = self.drop2(x) - return x - - -class ConvMlp(nn.Module): - """ MLP using 1x1 convs that keeps spatial dims - """ - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.ReLU, - norm_layer=None, - bias=True, - drop=0., - ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - bias = to_2tuple(bias) - - self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1, bias=bias[0]) - self.norm = norm_layer(hidden_features) if norm_layer else nn.Identity() - self.act = act_layer() - self.drop = nn.Dropout(drop) - self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1, bias=bias[1]) - - def forward(self, x): - x = self.fc1(x) - x = self.norm(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - return x - - -class GlobalResponseNormMlp(nn.Module): - """ MLP w/ Global Response Norm (see grn.py), nn.Linear or 1x1 Conv2d - """ - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - bias=True, - drop=0., - use_conv=False, - ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - bias = to_2tuple(bias) - drop_probs = to_2tuple(drop) - linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear - - self.fc1 = 
linear_layer(in_features, hidden_features, bias=bias[0]) - self.act = act_layer() - self.drop1 = nn.Dropout(drop_probs[0]) - self.grn = GlobalResponseNorm(hidden_features, channels_last=not use_conv) - self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1]) - self.drop2 = nn.Dropout(drop_probs[1]) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop1(x) - x = self.grn(x) - x = self.fc2(x) - x = self.drop2(x) - return x diff --git a/mindnlp/mimm/layers/non_local_attn.py b/mindnlp/mimm/layers/non_local_attn.py deleted file mode 100644 index 32c22b4fe..000000000 --- a/mindnlp/mimm/layers/non_local_attn.py +++ /dev/null @@ -1,143 +0,0 @@ -""" Bilinear-Attention-Transform and Non-Local Attention - -Paper: `Non-Local Neural Networks With Grouped Bilinear Attentional Transforms` - - https://openaccess.thecvf.com/content_CVPR_2020/html/Chi_Non-Local_Neural_Networks_With_Grouped_Bilinear_Attentional_Transforms_CVPR_2020_paper.html -Adapted from original code: https://github.com/BA-Transform/BAT-Image-Classification -""" -from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F - -from .conv_bn_act import ConvNormAct -from .helpers import make_divisible - - -class NonLocalAttn(nn.Module): - """Spatial NL block for image classification. - - This was adapted from https://github.com/BA-Transform/BAT-Image-Classification - Their NonLocal impl inspired by https://github.com/facebookresearch/video-nonlocal-net. - """ - - def __init__(self, in_channels, use_scale=True, rd_ratio=1/8, rd_channels=None, rd_divisor=8, **kwargs): - super(NonLocalAttn, self).__init__() - if rd_channels is None: - rd_channels = make_divisible(in_channels * rd_ratio, divisor=rd_divisor) - self.scale = in_channels ** -0.5 if use_scale else 1.0 - self.t = nn.Conv2d(in_channels, rd_channels, kernel_size=1, stride=1, bias=True) - self.p = nn.Conv2d(in_channels, rd_channels, kernel_size=1, stride=1, bias=True) - self.g = nn.Conv2d(in_channels, rd_channels, kernel_size=1, stride=1, bias=True) - self.z = nn.Conv2d(rd_channels, in_channels, kernel_size=1, stride=1, bias=True) - self.norm = nn.BatchNorm2d(in_channels) - self.reset_parameters() - - def forward(self, x): - shortcut = x - - t = self.t(x) - p = self.p(x) - g = self.g(x) - - B, C, H, W = t.size() - t = t.view(B, C, -1).permute(0, 2, 1) - p = p.view(B, C, -1) - g = g.view(B, C, -1).permute(0, 2, 1) - - att = ops.bmm(t, p) * self.scale - att = F.softmax(att, dim=2) - x = ops.bmm(att, g) - - x = x.permute(0, 2, 1).reshape(B, C, H, W) - x = self.z(x) - x = self.norm(x) + shortcut - - return x - - def reset_parameters(self): - for name, m in self.named_modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_( - m.weight, mode='fan_out', nonlinearity='relu') - if len(list(m.parameters())) > 1: - nn.init.constant_(m.bias, 0.0) - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 0) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.GroupNorm): - nn.init.constant_(m.weight, 0) - nn.init.constant_(m.bias, 0) - - -class BilinearAttnTransform(nn.Module): - - def __init__(self, in_channels, block_size, groups, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d): - super(BilinearAttnTransform, self).__init__() - - self.conv1 = ConvNormAct(in_channels, groups, 1, act_layer=act_layer, norm_layer=norm_layer) - self.conv_p = nn.Conv2d(groups, block_size * block_size * groups, kernel_size=(block_size, 1)) - self.conv_q = nn.Conv2d(groups, block_size * block_size * groups, kernel_size=(1, block_size)) - self.conv2 
= ConvNormAct(in_channels, in_channels, 1, act_layer=act_layer, norm_layer=norm_layer) - self.block_size = block_size - self.groups = groups - self.in_channels = in_channels - - def resize_mat(self, x, t: int): - B, C, block_size, block_size1 = x.shape - assert block_size == block_size1, '' - if t <= 1: - return x - x = x.view(B * C, -1, 1, 1) - x = x * ops.eye(t, t, dtype=x.dtype) - x = x.view(B * C, block_size, block_size, t, t) - x = ops.cat(ops.split(x, 1, dim=1), dim=3) - x = ops.cat(ops.split(x, 1, dim=2), dim=4) - x = x.view(B, C, block_size * t, block_size * t) - return x - - def forward(self, x): - assert x.shape[-1] % self.block_size == 0, '' - assert x.shape[-2] % self.block_size == 0, '' - B, C, H, W = x.shape - out = self.conv1(x) - rp = F.adaptive_max_pool2d(out, (self.block_size, 1)) - cp = F.adaptive_max_pool2d(out, (1, self.block_size)) - p = self.conv_p(rp).view(B, self.groups, self.block_size, self.block_size).sigmoid() - q = self.conv_q(cp).view(B, self.groups, self.block_size, self.block_size).sigmoid() - p = p / p.sum(dim=3, keepdim=True) - q = q / q.sum(dim=2, keepdim=True) - p = p.view(B, self.groups, 1, self.block_size, self.block_size).expand(x.size( - 0), self.groups, C // self.groups, self.block_size, self.block_size).contiguous() - p = p.view(B, C, self.block_size, self.block_size) - q = q.view(B, self.groups, 1, self.block_size, self.block_size).expand(x.size( - 0), self.groups, C // self.groups, self.block_size, self.block_size).contiguous() - q = q.view(B, C, self.block_size, self.block_size) - p = self.resize_mat(p, H // self.block_size) - q = self.resize_mat(q, W // self.block_size) - y = p.matmul(x) - y = y.matmul(q) - - y = self.conv2(y) - return y - - -class BatNonLocalAttn(nn.Module): - """ BAT - Adapted from: https://github.com/BA-Transform/BAT-Image-Classification - """ - - def __init__( - self, in_channels, block_size=7, groups=2, rd_ratio=0.25, rd_channels=None, rd_divisor=8, - drop_rate=0.2, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, **_): - super().__init__() - if rd_channels is None: - rd_channels = make_divisible(in_channels * rd_ratio, divisor=rd_divisor) - self.conv1 = ConvNormAct(in_channels, rd_channels, 1, act_layer=act_layer, norm_layer=norm_layer) - self.ba = BilinearAttnTransform(rd_channels, block_size, groups, act_layer=act_layer, norm_layer=norm_layer) - self.conv2 = ConvNormAct(rd_channels, in_channels, 1, act_layer=act_layer, norm_layer=norm_layer) - self.dropout = nn.Dropout2d(p=drop_rate) - - def forward(self, x): - xl = self.conv1(x) - y = self.ba(xl) - y = self.conv2(y) - y = self.dropout(y) - return y + x diff --git a/mindnlp/mimm/layers/norm.py b/mindnlp/mimm/layers/norm.py deleted file mode 100644 index 0e16459c2..000000000 --- a/mindnlp/mimm/layers/norm.py +++ /dev/null @@ -1,171 +0,0 @@ -""" Normalization layers and wrappers - -Norm layer definitions that support fast norm and consistent channel arg order (always first arg). 
- -Hacked together by / Copyright 2022 Ross Wightman -""" -import numbers -from typing import Tuple - -import mindspore -from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F - - -class GroupNorm(nn.GroupNorm): - def __init__(self, num_channels, num_groups=32, eps=1e-5, affine=True): - # NOTE num_channels is swapped to first arg for consistency in swapping norm layers with BN - super().__init__(num_groups, num_channels, eps=eps, affine=affine) - - def forward(self, x): - return F.group_norm(x, self.num_groups, self.weight, self.bias, self.eps) - - -class GroupNorm1(nn.GroupNorm): - """ Group Normalization with 1 group. - Input: tensor in shape [B, C, *] - """ - - def __init__(self, num_channels, **kwargs): - super().__init__(1, num_channels, **kwargs) - - def forward(self, x: mindspore.Tensor) -> mindspore.Tensor: - return F.group_norm(x, self.num_groups, self.weight, self.bias, self.eps) - - -class LayerNorm(nn.LayerNorm): - """ LayerNorm w/ fast norm option - """ - def __init__(self, num_channels, eps=1e-6, affine=True): - super().__init__(num_channels, eps=eps, elementwise_affine=affine) - - def forward(self, x: mindspore.Tensor) -> mindspore.Tensor: - x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) - return x - - -class LayerNorm2d(nn.LayerNorm): - """ LayerNorm for channels of '2D' spatial NCHW tensors """ - def __init__(self, num_channels, eps=1e-6, affine=True): - super().__init__(num_channels, eps=eps, elementwise_affine=affine) - - def forward(self, x: mindspore.Tensor) -> mindspore.Tensor: - x = x.permute(0, 2, 3, 1) - x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) - x = x.permute(0, 3, 1, 2) - return x - - -def _is_contiguous(tensor: mindspore.Tensor) -> bool: - # jit is oh so lovely :/ - return tensor.is_contiguous() - - -def _layer_norm_cf(x: mindspore.Tensor, weight: mindspore.Tensor, bias: mindspore.Tensor, eps: float): - s, u = ops.var_mean(x, dim=1, correction=0, keepdim=True) - x = (x - u) * ops.rsqrt(s + eps) - x = x * weight[:, None, None] + bias[:, None, None] - return x - - -def _layer_norm_cf_sqm(x: mindspore.Tensor, weight: mindspore.Tensor, bias: mindspore.Tensor, eps: float): - u = x.mean(dim=1, keepdim=True) - s = ((x * x).mean(dim=1, keepdim=True) - (u * u)).clamp(0) - x = (x - u) * ops.rsqrt(s + eps) - x = x * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1) - return x - - -class LayerNormExp2d(nn.LayerNorm): - """ LayerNorm for channels_first tensors with 2d spatial dimensions (ie N, C, H, W). - - Experimental implementation w/ manual norm for tensors non-contiguous tensors. - - This improves throughput in some scenarios (tested on Ampere GPU), esp w/ channels_last - layout. However, benefits are not always clear and can perform worse on other GPUs. - """ - - def __init__(self, num_channels, eps=1e-6): - super().__init__(num_channels, eps=eps) - - def forward(self, x) -> mindspore.Tensor: - if _is_contiguous(x): - x = F.layer_norm( - x.permute(0, 2, 3, 1), self.normalized_shape, self.weight, self.bias, self.eps).permute(0, 3, 1, 2) - else: - x = _layer_norm_cf(x, self.weight, self.bias, self.eps) - return x - - -class RmsNorm(nn.Module): - """ RmsNorm w/ fast (apex) norm if available - """ - __constants__ = ['normalized_shape', 'eps', 'elementwise_affine'] - normalized_shape: Tuple[int, ...] 
- eps: float - elementwise_affine: bool - - def __init__(self, channels, eps=1e-6, affine=True, dtype=None) -> None: - factory_kwargs = {'dtype': dtype} - super().__init__() - normalized_shape = channels - if isinstance(normalized_shape, numbers.Integral): - # mypy error: incompatible types in assignment - normalized_shape = (normalized_shape,) # type: ignore[assignment] - self.normalized_shape = tuple(normalized_shape) # type: ignore[arg-type] - self.eps = eps - self.elementwise_affine = affine - if self.elementwise_affine: - self.weight = nn.Parameter(ops.empty(self.normalized_shape, **factory_kwargs)) - else: - self.register_parameter('weight', None) - - self.reset_parameters() - - def reset_parameters(self) -> None: - if self.elementwise_affine: - nn.init.ones_(self.weight) - - def forward(self, x: mindspore.Tensor) -> mindspore.Tensor: - # NOTE fast norm fallback needs our rms norm impl, so both paths through here. - # Since there is no built-in PyTorch impl, always use APEX RmsNorm if is installed. - x = F.rms_norm(x, self.normalized_shape, self.weight, self.eps) - return x - - -class RmsNorm2d(nn.Module): - """ RmsNorm w/ fast (apex) norm if available - """ - __constants__ = ['normalized_shape', 'eps', 'elementwise_affine'] - normalized_shape: Tuple[int, ...] - eps: float - elementwise_affine: bool - - def __init__(self, channels, eps=1e-6, affine=True, dtype=None) -> None: - factory_kwargs = {'dtype': dtype} - super().__init__() - normalized_shape = channels - if isinstance(normalized_shape, numbers.Integral): - # mypy error: incompatible types in assignment - normalized_shape = (normalized_shape,) # type: ignore[assignment] - self.normalized_shape = tuple(normalized_shape) # type: ignore[arg-type] - self.eps = eps - self.elementwise_affine = affine - if self.elementwise_affine: - self.weight = nn.Parameter(ops.empty(self.normalized_shape, **factory_kwargs)) - else: - self.register_parameter('weight', None) - - self.reset_parameters() - - def reset_parameters(self) -> None: - if self.elementwise_affine: - nn.init.ones_(self.weight) - - def forward(self, x: mindspore.Tensor) -> mindspore.Tensor: - x = x.permute(0, 2, 3, 1) - # NOTE fast norm fallback needs our rms norm impl, so both paths through here. - # Since there is no built-in PyTorch impl, always use APEX RmsNorm if is installed. 
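# rms_norm normalizes as x / sqrt(mean(x ** 2, dim=-1) + eps) * weight, i.e. LayerNorm
# without mean-centering or a bias term; the permutes move channels to the last dim so
# the reduction runs over C for NCHW input, then restore the NCHW layout.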
- x = F.rms_norm(x, self.normalized_shape, self.weight, self.eps) - x = x.permute(0, 3, 1, 2) - return x diff --git a/mindnlp/mimm/layers/norm_act.py b/mindnlp/mimm/layers/norm_act.py deleted file mode 100644 index 911479ebd..000000000 --- a/mindnlp/mimm/layers/norm_act.py +++ /dev/null @@ -1,440 +0,0 @@ -""" Normalization + Activation Layers - -Provides Norm+Act fns for standard PyTorch norm layers such as -* BatchNorm -* GroupNorm -* LayerNorm - -This allows swapping with alternative layers that are natively both norm + act such as -* EvoNorm (evo_norm.py) -* FilterResponseNorm (filter_response_norm.py) -* InplaceABN (inplace_abn.py) - -Hacked together by / Copyright 2022 Ross Wightman -""" -from typing import Union, List, Tuple - -import mindspore -from mindnlp.core import nn, ops, no_grad -from mindnlp.core.nn import functional as F -# from torchvision.ops.misc import FrozenBatchNorm2d - -from .create_act import create_act_layer - - -def _create_act(act_layer, act_kwargs=None, inplace=False, apply_act=True): - act_kwargs = act_kwargs or {} - act_kwargs.setdefault('inplace', inplace) - act = None - if apply_act: - act = create_act_layer(act_layer, **act_kwargs) - return nn.Identity() if act is None else act - - -class BatchNormAct2d(nn.BatchNorm2d): - """BatchNorm + Activation - - This module performs BatchNorm + Activation in a manner that will remain backwards - compatible with weights trained with separate bn, act. This is why we inherit from BN - instead of composing it as a .bn member. - """ - def __init__( - self, - num_features, - eps=1e-5, - momentum=0.1, - affine=True, - track_running_stats=True, - apply_act=True, - act_layer=nn.ReLU, - act_kwargs=None, - inplace=True, - drop_layer=None, - dtype=None, - ): - try: - factory_kwargs = {'dtype': dtype} - super(BatchNormAct2d, self).__init__( - num_features, - eps=eps, - momentum=momentum, - affine=affine, - track_running_stats=track_running_stats, - **factory_kwargs, - ) - except TypeError: - # NOTE for backwards compat with old PyTorch w/o factory device/dtype support - super(BatchNormAct2d, self).__init__( - num_features, - eps=eps, - momentum=momentum, - affine=affine, - track_running_stats=track_running_stats, - ) - self.drop = drop_layer() if drop_layer is not None else nn.Identity() - self.act = _create_act(act_layer, act_kwargs=act_kwargs, inplace=inplace, apply_act=apply_act) - - def forward(self, x): - # cut & paste of nn.BatchNorm2d.forward impl to avoid issues with torchscript and tracing - assert x.ndim == 4, f'expected 4D input (got {x.ndim}D input)' - - # exponential_average_factor is set to self.momentum - # (when it is available) only so that it gets updated - # in ONNX graph when this node is exported to ONNX. - if self.momentum is None: - exponential_average_factor = 0.0 - else: - exponential_average_factor = self.momentum - - if self.training and self.track_running_stats: - # TODO: if statement only here to tell the jit to skip emitting this when it is None - if self.num_batches_tracked is not None: # type: ignore[has-type] - self.num_batches_tracked.add_(1) # type: ignore[has-type] - if self.momentum is None: # use cumulative moving average - exponential_average_factor = 1.0 / float(self.num_batches_tracked) - else: # use exponential moving average - exponential_average_factor = self.momentum - - r""" - Decide whether the mini-batch stats should be used for normalization rather than the buffers. - Mini-batch stats are used in training mode, and in eval mode when buffers are None. 
- """ - if self.training: - bn_training = True - else: - bn_training = (self.running_mean is None) and (self.running_var is None) - - r""" - Buffers are only updated if they are to be tracked and we are in training mode. Thus they only need to be - passed when the update should occur (i.e. in training mode when they are tracked), or when buffer stats are - used for normalization (i.e. in eval mode when buffers are not None). - """ - x = F.batch_norm( - x, - # If buffers are not to be tracked, ensure that they won't be updated - self.running_mean if not self.training or self.track_running_stats else None, - self.running_var if not self.training or self.track_running_stats else None, - self.weight, - self.bias, - bn_training, - exponential_average_factor, - self.eps, - ) - x = self.drop(x) - x = self.act(x) - return x - - -class SyncBatchNormAct(nn.SyncBatchNorm): - # Thanks to Selim Seferbekov (https://github.com/rwightman/pytorch-image-models/issues/1254) - # This is a quick workaround to support SyncBatchNorm for timm BatchNormAct2d layers - # but ONLY when used in conjunction with the timm conversion function below. - # Do not create this module directly or use the PyTorch conversion function. - def forward(self, x: mindspore.Tensor) -> mindspore.Tensor: - x = super().forward(x) # SyncBN doesn't work with torchscript anyways, so this is fine - if hasattr(self, "drop"): - x = self.drop(x) - if hasattr(self, "act"): - x = self.act(x) - return x - - -def convert_sync_batchnorm(module, process_group=None): - # convert both BatchNorm and BatchNormAct layers to Synchronized variants - module_output = module - if isinstance(module, nn.modules.batchnorm._BatchNorm): - if isinstance(module, BatchNormAct2d): - # convert timm norm + act layer - module_output = SyncBatchNormAct( - module.num_features, - module.eps, - module.momentum, - module.affine, - module.track_running_stats, - process_group=process_group, - ) - # set act and drop attr from the original module - module_output.act = module.act - module_output.drop = module.drop - else: - # convert standard BatchNorm layers - module_output = nn.SyncBatchNorm( - module.num_features, - module.eps, - module.momentum, - module.affine, - module.track_running_stats, - process_group, - ) - if module.affine: - with no_grad(): - module_output.weight = module.weight - module_output.bias = module.bias - module_output.running_mean = module.running_mean - module_output.running_var = module.running_var - module_output.num_batches_tracked = module.num_batches_tracked - if hasattr(module, "qconfig"): - module_output.qconfig = module.qconfig - for name, child in module.named_children(): - module_output.add_module(name, convert_sync_batchnorm(child, process_group)) - del module - return module_output - - -class FrozenBatchNormAct2d(nn.Module): - """ - BatchNormAct2d where the batch statistics and the affine parameters are fixed - - Args: - num_features (int): Number of features ``C`` from an expected input of size ``(N, C, H, W)`` - eps (float): a value added to the denominator for numerical stability. 
Default: 1e-5 - """ - - def __init__( - self, - num_features: int, - eps: float = 1e-5, - apply_act=True, - act_layer=nn.ReLU, - act_kwargs=None, - inplace=True, - drop_layer=None, - ): - super().__init__() - self.eps = eps - self.register_buffer("weight", ops.ones(num_features)) - self.register_buffer("bias", ops.zeros(num_features)) - self.register_buffer("running_mean", ops.zeros(num_features)) - self.register_buffer("running_var", ops.ones(num_features)) - - self.drop = drop_layer() if drop_layer is not None else nn.Identity() - self.act = _create_act(act_layer, act_kwargs=act_kwargs, inplace=inplace, apply_act=apply_act) - - def _load_from_state_dict( - self, - state_dict: dict, - prefix: str, - local_metadata: dict, - strict: bool, - missing_keys: List[str], - unexpected_keys: List[str], - error_msgs: List[str], - ): - num_batches_tracked_key = prefix + "num_batches_tracked" - if num_batches_tracked_key in state_dict: - del state_dict[num_batches_tracked_key] - - super()._load_from_state_dict( - state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs - ) - - def forward(self, x: mindspore.Tensor) -> mindspore.Tensor: - # move reshapes to the beginning - # to make it fuser-friendly - w = self.weight.reshape(1, -1, 1, 1) - b = self.bias.reshape(1, -1, 1, 1) - rv = self.running_var.reshape(1, -1, 1, 1) - rm = self.running_mean.reshape(1, -1, 1, 1) - scale = w * (rv + self.eps).rsqrt() - bias = b - rm * scale - x = x * scale + bias - x = self.act(self.drop(x)) - return x - - def __repr__(self) -> str: - return f"{self.__class__.__name__}({self.weight.shape[0]}, eps={self.eps}, act={self.act})" - - -def freeze_batch_norm_2d(module): - """ - Converts all `BatchNorm2d` and `SyncBatchNorm` or `BatchNormAct2d` and `SyncBatchNormAct2d` layers - of provided module into `FrozenBatchNorm2d` or `FrozenBatchNormAct2d` respectively. - - Args: - module (nn.Module): Any PyTorch module. - - Returns: - nn.Module: Resulting module - - Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762 - """ - res = module - if isinstance(module, (BatchNormAct2d, SyncBatchNormAct)): - res = FrozenBatchNormAct2d(module.num_features) - res.num_features = module.num_features - res.affine = module.affine - if module.affine: - res.weight.data = module.weight.data.clone().detach() - res.bias.data = module.bias.data.clone().detach() - res.running_mean.data = module.running_mean.data - res.running_var.data = module.running_var.data - res.eps = module.eps - res.drop = module.drop - res.act = module.act - # elif isinstance(module, (nn.modules.batchnorm.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm)): - # res = FrozenBatchNorm2d(module.num_features) - # res.num_features = module.num_features - # res.affine = module.affine - # if module.affine: - # res.weight.data = module.weight.data.clone().detach() - # res.bias.data = module.bias.data.clone().detach() - # res.running_mean.data = module.running_mean.data - # res.running_var.data = module.running_var.data - # res.eps = module.eps - else: - for name, child in module.named_children(): - new_child = freeze_batch_norm_2d(child) - if new_child is not child: - res.add_module(name, new_child) - return res - - -def unfreeze_batch_norm_2d(module): - """ - Converts all `FrozenBatchNorm2d` layers of provided module into `BatchNorm2d`. If `module` is itself and instance - of `FrozenBatchNorm2d`, it is converted into `BatchNorm2d` and returned. 
Otherwise, the module is walked - recursively and submodules are converted in place. - - Args: - module (nn.Module): Any PyTorch module. - - Returns: - nn.Module: Resulting module - - Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762 - """ - res = module - if isinstance(module, FrozenBatchNormAct2d): - res = BatchNormAct2d(module.num_features) - if module.affine: - res.weight.data = module.weight.data.clone().detach() - res.bias.data = module.bias.data.clone().detach() - res.running_mean.data = module.running_mean.data - res.running_var.data = module.running_var.data - res.eps = module.eps - res.drop = module.drop - res.act = module.act - # elif isinstance(module, FrozenBatchNorm2d): - # res = nn.BatchNorm2d(module.num_features) - # if module.affine: - # res.weight.data = module.weight.data.clone().detach() - # res.bias.data = module.bias.data.clone().detach() - # res.running_mean.data = module.running_mean.data - # res.running_var.data = module.running_var.data - # res.eps = module.eps - else: - for name, child in module.named_children(): - new_child = unfreeze_batch_norm_2d(child) - if new_child is not child: - res.add_module(name, new_child) - return res - - -def _num_groups(num_channels, num_groups, group_size): - if group_size: - assert num_channels % group_size == 0 - return num_channels // group_size - return num_groups - - -class GroupNormAct(nn.GroupNorm): - # NOTE num_channel and num_groups order flipped for easier layer swaps / binding of fixed args - def __init__( - self, - num_channels, - num_groups=32, - eps=1e-5, - affine=True, - group_size=None, - apply_act=True, - act_layer=nn.ReLU, - act_kwargs=None, - inplace=True, - drop_layer=None, - ): - super(GroupNormAct, self).__init__( - _num_groups(num_channels, num_groups, group_size), - num_channels, - eps=eps, - affine=affine, - ) - self.drop = drop_layer() if drop_layer is not None else nn.Identity() - self.act = _create_act(act_layer, act_kwargs=act_kwargs, inplace=inplace, apply_act=apply_act) - - def forward(self, x): - x = F.group_norm(x, self.num_groups, self.weight, self.bias, self.eps) - x = self.drop(x) - x = self.act(x) - return x - - -class GroupNorm1Act(nn.GroupNorm): - def __init__( - self, - num_channels, - eps=1e-5, - affine=True, - apply_act=True, - act_layer=nn.ReLU, - act_kwargs=None, - inplace=True, - drop_layer=None, - ): - super(GroupNorm1Act, self).__init__(1, num_channels, eps=eps, affine=affine) - self.drop = drop_layer() if drop_layer is not None else nn.Identity() - self.act = _create_act(act_layer, act_kwargs=act_kwargs, inplace=inplace, apply_act=apply_act) - - - def forward(self, x): - x = F.group_norm(x, self.num_groups, self.weight, self.bias, self.eps) - x = self.drop(x) - x = self.act(x) - return x - - -class LayerNormAct(nn.LayerNorm): - def __init__( - self, - normalization_shape: Union[int, List[int], Tuple[int]], - eps=1e-5, - affine=True, - apply_act=True, - act_layer=nn.ReLU, - act_kwargs=None, - inplace=True, - drop_layer=None, - ): - super(LayerNormAct, self).__init__(normalization_shape, eps=eps, elementwise_affine=affine) - self.drop = drop_layer() if drop_layer is not None else nn.Identity() - self.act = _create_act(act_layer, act_kwargs=act_kwargs, inplace=inplace, apply_act=apply_act) - - def forward(self, x): - x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) - x = self.drop(x) - x = self.act(x) - return x - - -class LayerNormAct2d(nn.LayerNorm): - def __init__( - self, - 
num_channels, - eps=1e-5, - affine=True, - apply_act=True, - act_layer=nn.ReLU, - act_kwargs=None, - inplace=True, - drop_layer=None, - ): - super(LayerNormAct2d, self).__init__(num_channels, eps=eps, elementwise_affine=affine) - self.drop = drop_layer() if drop_layer is not None else nn.Identity() - self.act = _create_act(act_layer, act_kwargs=act_kwargs, inplace=inplace, apply_act=apply_act) - - def forward(self, x): - x = x.permute(0, 2, 3, 1) - x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) - x = x.permute(0, 3, 1, 2) - x = self.drop(x) - x = self.act(x) - return x diff --git a/mindnlp/mimm/layers/padding.py b/mindnlp/mimm/layers/padding.py deleted file mode 100644 index 075bc0514..000000000 --- a/mindnlp/mimm/layers/padding.py +++ /dev/null @@ -1,88 +0,0 @@ -""" Padding Helpers - -Hacked together by / Copyright 2020 Ross Wightman -""" -import math -from typing import List, Tuple, Union - -import mindspore -from mindnlp.core import ops -from mindnlp.core.nn import functional as F - -from .helpers import to_2tuple - - -# Calculate symmetric padding for a convolution -def get_padding(kernel_size: int, stride: int = 1, dilation: int = 1, **_) -> Union[int, List[int]]: - if any([isinstance(v, (tuple, list)) for v in [kernel_size, stride, dilation]]): - kernel_size, stride, dilation = to_2tuple(kernel_size), to_2tuple(stride), to_2tuple(dilation) - return [get_padding(*a) for a in zip(kernel_size, stride, dilation)] - padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 - return padding - - -# Calculate asymmetric TensorFlow-like 'SAME' padding for a convolution -def get_same_padding(x: int, kernel_size: int, stride: int, dilation: int): - if isinstance(x, mindspore.Tensor): - return ops.clamp(((x / stride).ceil() - 1) * stride + (kernel_size - 1) * dilation + 1 - x, min=0) - else: - return max((math.ceil(x / stride) - 1) * stride + (kernel_size - 1) * dilation + 1 - x, 0) - - -# Can SAME padding for given args be done statically? 
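# Illustrative, standalone sketch (example sizes chosen for illustration) of the 'SAME'
# padding arithmetic used by get_same_padding above for plain int inputs:
#   pad = max((ceil(x / stride) - 1) * stride + (kernel_size - 1) * dilation + 1 - x, 0)
def _same_pad_1d_example(x: int, kernel_size: int, stride: int, dilation: int = 1) -> int:
    # total padding along one spatial dim so the output covers ceil(x / stride) positions
    return max((math.ceil(x / stride) - 1) * stride + (kernel_size - 1) * dilation + 1 - x, 0)

# 224-wide input, 3x3 kernel: stride 1 needs 2 padded pixels (1 per side, matching
# get_padding(3) == 1); stride 2 needs only 1, which pad_same_arg splits asymmetrically
# as (0 left/top, 1 right/bottom).
assert _same_pad_1d_example(224, 3, 1) == 2
assert _same_pad_1d_example(224, 3, 2) == 1

# is_static_pad below formalizes the question above: the pad amount is independent of the
# input size only when stride == 1, and it must also split evenly, i.e.
# (dilation * (kernel_size - 1)) % 2 == 0.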
-def is_static_pad(kernel_size: int, stride: int = 1, dilation: int = 1, **_): - if any([isinstance(v, (tuple, list)) for v in [kernel_size, stride, dilation]]): - kernel_size, stride, dilation = to_2tuple(kernel_size), to_2tuple(stride), to_2tuple(dilation) - return all([is_static_pad(*a) for a in zip(kernel_size, stride, dilation)]) - return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0 - - -def pad_same_arg( - input_size: List[int], - kernel_size: List[int], - stride: List[int], - dilation: List[int] = (1, 1), -) -> List[int]: - ih, iw = input_size - kh, kw = kernel_size - pad_h = get_same_padding(ih, kh, stride[0], dilation[0]) - pad_w = get_same_padding(iw, kw, stride[1], dilation[1]) - return [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2] - - -# Dynamically pad input x with 'SAME' padding for conv with specified args -def pad_same( - x, - kernel_size: List[int], - stride: List[int], - dilation: List[int] = (1, 1), - value: float = 0, -): - ih, iw = x.size()[-2:] - pad_h = get_same_padding(ih, kernel_size[0], stride[0], dilation[0]) - pad_w = get_same_padding(iw, kernel_size[1], stride[1], dilation[1]) - x = F.pad(x, (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2), value=value) - return x - - -def get_padding_value(padding, kernel_size, **kwargs) -> Tuple[Tuple, bool]: - dynamic = False - if isinstance(padding, str): - # for any string padding, the padding will be calculated for you, one of three ways - padding = padding.lower() - if padding == 'same': - # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact - if is_static_pad(kernel_size, **kwargs): - # static case, no extra overhead - padding = get_padding(kernel_size, **kwargs) - else: - # dynamic 'SAME' padding, has runtime/GPU memory overhead - padding = 0 - dynamic = True - elif padding == 'valid': - # 'VALID' padding, same as padding=0 - padding = 0 - else: - # Default to PyTorch style 'same'-ish symmetric padding - padding = get_padding(kernel_size, **kwargs) - return padding, dynamic diff --git a/mindnlp/mimm/layers/patch_dropout.py b/mindnlp/mimm/layers/patch_dropout.py deleted file mode 100644 index 22687190f..000000000 --- a/mindnlp/mimm/layers/patch_dropout.py +++ /dev/null @@ -1,53 +0,0 @@ -"""patch dropout""" -from typing import Optional, Tuple, Union - -import mindspore -from mindnlp.core import nn, ops - -class PatchDropout(nn.Module): - """ - https://arxiv.org/abs/2212.00794 and https://arxiv.org/pdf/2208.07220 - """ - return_indices: bool - - def __init__( - self, - prob: float = 0.5, - num_prefix_tokens: int = 1, - ordered: bool = False, - return_indices: bool = False, - ): - super().__init__() - assert 0 <= prob < 1. - self.prob = prob - self.num_prefix_tokens = num_prefix_tokens # exclude CLS token (or other prefix tokens) - self.ordered = ordered - self.return_indices = return_indices - - def forward(self, x) -> Union[mindspore.Tensor, Tuple[mindspore.Tensor, Optional[mindspore.Tensor]]]: - if not self.training or self.prob == 0.: - if self.return_indices: - return x, None - return x - - if self.num_prefix_tokens: - prefix_tokens, x = x[:, :self.num_prefix_tokens], x[:, self.num_prefix_tokens:] - else: - prefix_tokens = None - - B = x.shape[0] - L = x.shape[1] - num_keep = max(1, int(L * (1. 
- self.prob))) - keep_indices = ops.argsort(ops.randn(B, L), dim=-1)[:, :num_keep] - if self.ordered: - # NOTE does not need to maintain patch order in typical transformer use, - # but possibly useful for debug / visualization - keep_indices = keep_indices.sort(dim=-1)[0] - x = x.gather(1, keep_indices.unsqueeze(-1).expand((-1, -1) + x.shape[2:])) - - if prefix_tokens is not None: - x = ops.cat((prefix_tokens, x), dim=1) - - if self.return_indices: - return x, keep_indices - return x diff --git a/mindnlp/mimm/layers/patch_embed.py b/mindnlp/mimm/layers/patch_embed.py deleted file mode 100644 index 87b53ee43..000000000 --- a/mindnlp/mimm/layers/patch_embed.py +++ /dev/null @@ -1,239 +0,0 @@ -""" Image to Patch Embedding using Conv2d - -A convolution based approach to patchifying a 2D image w/ embedding projection. - -Based on code in: - * https://github.com/google-research/vision_transformer - * https://github.com/google-research/big_vision/tree/main/big_vision - -Hacked together by / Copyright 2020 Ross Wightman -""" -import logging -import math -from typing import Callable, List, Optional, Tuple, Union - -import mindspore -from mindnlp.core import nn, no_grad -from mindnlp.core.nn import functional as F - -from .format import Format, nchw_to -from .helpers import to_2tuple - -_logger = logging.getLogger(__name__) - - -class PatchEmbed(nn.Module): - """ 2D Image to Patch Embedding - """ - output_fmt: Format - dynamic_img_pad: bool - - def __init__( - self, - img_size: Optional[int] = 224, - patch_size: int = 16, - in_chans: int = 3, - embed_dim: int = 768, - norm_layer: Optional[Callable] = None, - flatten: bool = True, - output_fmt: Optional[str] = None, - bias: bool = True, - strict_img_size: bool = True, - dynamic_img_pad: bool = False, - ): - super().__init__() - self.patch_size = to_2tuple(patch_size) - self.img_size, self.grid_size, self.num_patches = self._init_img_size(img_size) - - if output_fmt is not None: - self.flatten = False - self.output_fmt = Format(output_fmt) - else: - # flatten spatial dim and transpose to channels last, kept for bwd compat - self.flatten = flatten - self.output_fmt = Format.NCHW - self.strict_img_size = strict_img_size - self.dynamic_img_pad = dynamic_img_pad - - self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias) - self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() - - def _init_img_size(self, img_size: Union[int, Tuple[int, int]]): - assert self.patch_size - if img_size is None: - return None, None, None - img_size = to_2tuple(img_size) - grid_size = tuple(s // p for s, p in zip(img_size, self.patch_size)) - num_patches = grid_size[0] * grid_size[1] - return img_size, grid_size, num_patches - - def set_input_size( - self, - img_size: Optional[Union[int, Tuple[int, int]]] = None, - patch_size: Optional[Union[int, Tuple[int, int]]] = None, - ): - new_patch_size = None - if patch_size is not None: - new_patch_size = to_2tuple(patch_size) - if new_patch_size is not None and new_patch_size != self.patch_size: - with no_grad(): - new_proj = nn.Conv2d( - self.proj.in_channels, - self.proj.out_channels, - kernel_size=new_patch_size, - stride=new_patch_size, - bias=self.proj.bias is not None, - ) - new_proj.weight.copy_(resample_patch_embed(self.proj.weight, new_patch_size, verbose=True)) - if self.proj.bias is not None: - new_proj.bias.copy_(self.proj.bias) - self.proj = new_proj - self.patch_size = new_patch_size - img_size = img_size or self.img_size - if img_size != self.img_size or 
new_patch_size is not None: - self.img_size, self.grid_size, self.num_patches = self._init_img_size(img_size) - - def feat_ratio(self, as_scalar=True) -> Union[Tuple[int, int], int]: - if as_scalar: - return max(self.patch_size) - else: - return self.patch_size - - def dynamic_feat_size(self, img_size: Tuple[int, int]) -> Tuple[int, int]: - """ Get grid (feature) size for given image size taking account of dynamic padding. - NOTE: must be torchscript compatible so using fixed tuple indexing - """ - if self.dynamic_img_pad: - return math.ceil(img_size[0] / self.patch_size[0]), math.ceil(img_size[1] / self.patch_size[1]) - else: - return img_size[0] // self.patch_size[0], img_size[1] // self.patch_size[1] - - def forward(self, x): - B, C, H, W = x.shape - if self.img_size is not None: - if self.strict_img_size: - assert H == self.img_size[0], f"Input height ({H}) doesn't match model ({self.img_size[0]})." - assert W == self.img_size[1], f"Input width ({W}) doesn't match model ({self.img_size[1]})." - elif not self.dynamic_img_pad: - assert H % self.patch_size[0] == 0, f"Input height ({H}) should be divisible by patch size ({self.patch_size[0]})." - assert W % self.patch_size[1] == 0, f"Input width ({W}) should be divisible by patch size ({self.patch_size[1]})." - if self.dynamic_img_pad: - pad_h = (self.patch_size[0] - H % self.patch_size[0]) % self.patch_size[0] - pad_w = (self.patch_size[1] - W % self.patch_size[1]) % self.patch_size[1] - x = F.pad(x, (0, pad_w, 0, pad_h)) - x = self.proj(x) - if self.flatten: - x = x.flatten(2).transpose(1, 2) # NCHW -> NLC - elif self.output_fmt != Format.NCHW: - x = nchw_to(x, self.output_fmt) - x = self.norm(x) - return x - - -class PatchEmbedWithSize(PatchEmbed): - """ 2D Image to Patch Embedding - """ - output_fmt: Format - - def __init__( - self, - img_size: Optional[int] = 224, - patch_size: int = 16, - in_chans: int = 3, - embed_dim: int = 768, - norm_layer: Optional[Callable] = None, - flatten: bool = True, - output_fmt: Optional[str] = None, - bias: bool = True, - ): - super().__init__( - img_size=img_size, - patch_size=patch_size, - in_chans=in_chans, - embed_dim=embed_dim, - norm_layer=norm_layer, - flatten=flatten, - output_fmt=output_fmt, - bias=bias, - ) - - def forward(self, x) -> Tuple[mindspore.Tensor, List[int]]: - B, C, H, W = x.shape - if self.img_size is not None: - assert H % self.patch_size[0] == 0, f"Input image height ({H}) must be divisible by patch size ({self.patch_size[0]})." - assert W % self.patch_size[1] == 0, f"Input image width ({W}) must be divisible by patch size ({self.patch_size[1]})." - - x = self.proj(x) - feat_size = x.shape[-2:] - if self.flatten: - x = x.flatten(2).transpose(1, 2) # NCHW -> NLC - elif self.output_fmt != Format.NCHW: - x = nchw_to(x, self.output_fmt) - x = self.norm(x) - return x, feat_size - - -def resample_patch_embed( - patch_embed, - new_size: List[int], - interpolation: str = 'bicubic', - antialias: bool = True, - verbose: bool = False, -): - """Resample the weights of the patch embedding kernel to target resolution. - We resample the patch embedding kernel by approximately inverting the effect - of patch resizing. - - Code based on: - https://github.com/google-research/big_vision/blob/b00544b81f8694488d5f36295aeb7972f3755ffe/big_vision/models/proj/flexi/vit.py - - With this resizing, we can for example load a B/8 filter into a B/16 model - and, on 2x larger input image, the result will match. - - Args: - patch_embed: original parameter to be resized. 
- new_size (tuple(int, int): target shape (height, width)-only. - interpolation (str): interpolation for resize - antialias (bool): use anti-aliasing filter in resize - verbose (bool): log operation - Returns: - Resized patch embedding kernel. - """ - import numpy as np - - assert len(patch_embed.shape) == 4, "Four dimensions expected" - assert len(new_size) == 2, "New shape should only be hw" - old_size = patch_embed.shape[-2:] - if tuple(old_size) == tuple(new_size): - return patch_embed - - if verbose: - _logger.info(f"Resize patch embedding {patch_embed.shape} to {new_size}, w/ {interpolation} interpolation.") - - def resize(x_np, _new_size): - x_tf = mindspore.Tensor(x_np)[None, None, ...] - x_upsampled = F.interpolate( - x_tf, size=_new_size, mode=interpolation, antialias=antialias)[0, 0, ...].numpy() - return x_upsampled - - def get_resize_mat(_old_size, _new_size): - mat = [] - for i in range(np.prod(_old_size)): - basis_vec = np.zeros(_old_size) - basis_vec[np.unravel_index(i, _old_size)] = 1. - mat.append(resize(basis_vec, _new_size).reshape(-1)) - return np.stack(mat).T - - resize_mat = get_resize_mat(old_size, new_size) - resize_mat_pinv = mindspore.tensor(np.linalg.pinv(resize_mat.T)) - - def resample_kernel(kernel): - resampled_kernel = resize_mat_pinv @ kernel.reshape(-1) - return resampled_kernel.reshape(new_size) - - v_resample_kernel = mindspore.vmap(mindspore.vmap(resample_kernel, 0, 0), 1, 1) - orig_dtype = patch_embed.dtype - patch_embed = patch_embed.float() - patch_embed = v_resample_kernel(patch_embed) - patch_embed = patch_embed.to(orig_dtype) - return patch_embed diff --git a/mindnlp/mimm/layers/pool2d_same.py b/mindnlp/mimm/layers/pool2d_same.py deleted file mode 100644 index 250221d24..000000000 --- a/mindnlp/mimm/layers/pool2d_same.py +++ /dev/null @@ -1,72 +0,0 @@ -""" AvgPool2d w/ Same Padding - -Hacked together by / Copyright 2020 Ross Wightman -""" -from typing import List -from mindnlp.core import nn -from mindnlp.core.nn import functional as F - -from .helpers import to_2tuple -from .padding import pad_same, get_padding_value - - -def avg_pool2d_same(x, kernel_size: List[int], stride: List[int], padding: List[int] = (0, 0), - ceil_mode: bool = False, count_include_pad: bool = True): - # FIXME how to deal with count_include_pad vs not for external padding? 
- x = pad_same(x, kernel_size, stride) - return F.avg_pool2d(x, kernel_size, stride, (0, 0), ceil_mode, count_include_pad) - - -class AvgPool2dSame(nn.AvgPool2d): - """ Tensorflow like 'SAME' wrapper for 2D average pooling - """ - def __init__(self, kernel_size: int, stride=None, padding=0, ceil_mode=False, count_include_pad=True): - kernel_size = to_2tuple(kernel_size) - stride = to_2tuple(stride) - super(AvgPool2dSame, self).__init__(kernel_size, stride, (0, 0), ceil_mode, count_include_pad) - - def forward(self, x): - x = pad_same(x, self.kernel_size, self.stride) - return F.avg_pool2d( - x, self.kernel_size, self.stride, self.padding, self.ceil_mode, self.count_include_pad) - - -def max_pool2d_same( - x, kernel_size: List[int], stride: List[int], padding: List[int] = (0, 0), - dilation: List[int] = (1, 1), ceil_mode: bool = False): - x = pad_same(x, kernel_size, stride, value=-float('inf')) - return F.max_pool2d(x, kernel_size, stride, (0, 0), dilation, ceil_mode) - - -class MaxPool2dSame(nn.MaxPool2d): - """ Tensorflow like 'SAME' wrapper for 2D max pooling - """ - def __init__(self, kernel_size: int, stride=None, padding=0, dilation=1, ceil_mode=False): - kernel_size = to_2tuple(kernel_size) - stride = to_2tuple(stride) - dilation = to_2tuple(dilation) - super(MaxPool2dSame, self).__init__(kernel_size, stride, (0, 0), dilation, ceil_mode) - - def forward(self, x): - x = pad_same(x, self.kernel_size, self.stride, value=-float('inf')) - return F.max_pool2d(x, self.kernel_size, self.stride, (0, 0), self.dilation, self.ceil_mode) - - -def create_pool2d(pool_type, kernel_size, stride=None, **kwargs): - stride = stride or kernel_size - padding = kwargs.pop('padding', '') - padding, is_dynamic = get_padding_value(padding, kernel_size, stride=stride, **kwargs) - if is_dynamic: - if pool_type == 'avg': - return AvgPool2dSame(kernel_size, stride=stride, **kwargs) - elif pool_type == 'max': - return MaxPool2dSame(kernel_size, stride=stride, **kwargs) - else: - assert False, f'Unsupported pool type {pool_type}' - else: - if pool_type == 'avg': - return nn.AvgPool2d(kernel_size, stride=stride, padding=padding, **kwargs) - elif pool_type == 'max': - return nn.MaxPool2d(kernel_size, stride=stride, padding=padding, **kwargs) - else: - assert False, f'Unsupported pool type {pool_type}' diff --git a/mindnlp/mimm/layers/pos_embed.py b/mindnlp/mimm/layers/pos_embed.py deleted file mode 100644 index 60120c30f..000000000 --- a/mindnlp/mimm/layers/pos_embed.py +++ /dev/null @@ -1,74 +0,0 @@ -""" Position Embedding Utilities - -Hacked together by / Copyright 2022 Ross Wightman -""" -import logging -import math -from typing import List, Optional - -import mindspore -from mindnlp.core import ops -from mindnlp.core.nn import functional as F - - -_logger = logging.getLogger(__name__) - - -def resample_abs_pos_embed( - posemb: mindspore.Tensor, - new_size: List[int], - old_size: Optional[List[int]] = None, - num_prefix_tokens: int = 1, - interpolation: str = 'bicubic', - antialias: bool = True, - verbose: bool = False, -): - # sort out sizes, assume square if old size not provided - num_pos_tokens = posemb.shape[1] - num_new_tokens = new_size[0] * new_size[1] + num_prefix_tokens - if num_new_tokens == num_pos_tokens and new_size[0] == new_size[1]: - return posemb - - if old_size is None: - hw = int(math.sqrt(num_pos_tokens - num_prefix_tokens)) - old_size = hw, hw - - if num_prefix_tokens: - posemb_prefix, posemb = posemb[:, :num_prefix_tokens], posemb[:, num_prefix_tokens:] - else: - posemb_prefix = None - - # 
do the interpolation - embed_dim = posemb.shape[-1] - orig_dtype = posemb.dtype - posemb = posemb.float() # interpolate needs float32 - posemb = posemb.reshape(1, old_size[0], old_size[1], -1).permute(0, 3, 1, 2) - posemb = F.interpolate(posemb, size=new_size, mode=interpolation, antialias=antialias) - posemb = posemb.permute(0, 2, 3, 1).reshape(1, -1, embed_dim) - posemb = posemb.to(orig_dtype) - - # add back extra (class, etc) prefix tokens - if posemb_prefix is not None: - posemb = ops.cat([posemb_prefix, posemb], dim=1) - - - return posemb - - -def resample_abs_pos_embed_nhwc( - posemb: mindspore.Tensor, - new_size: List[int], - interpolation: str = 'bicubic', - antialias: bool = True, - verbose: bool = False, -): - if new_size[0] == posemb.shape[-3] and new_size[1] == posemb.shape[-2]: - return posemb - - orig_dtype = posemb.dtype - posemb = posemb.float() - posemb = posemb.reshape(1, posemb.shape[-3], posemb.shape[-2], posemb.shape[-1]).permute(0, 3, 1, 2) - posemb = F.interpolate(posemb, size=new_size, mode=interpolation, antialias=antialias) - posemb = posemb.permute(0, 2, 3, 1).to(orig_dtype) - - return posemb diff --git a/mindnlp/mimm/layers/pos_embed_rel.py b/mindnlp/mimm/layers/pos_embed_rel.py deleted file mode 100644 index 7cd132558..000000000 --- a/mindnlp/mimm/layers/pos_embed_rel.py +++ /dev/null @@ -1,491 +0,0 @@ -""" Relative position embedding modules and functions - -Hacked together by / Copyright 2022 Ross Wightman -""" -import math -import os -from typing import Optional, Tuple - -import mindspore -from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F -from .grid import ndgrid - -from .interpolate import RegularGridInterpolator -from .mlp import Mlp -from .weight_init import trunc_normal_ - -_USE_SCIPY = int(os.environ.get('TIMM_USE_SCIPY_INTERP', 0)) > 0 - - -def gen_relative_position_index( - q_size: Tuple[int, int], - k_size: Optional[Tuple[int, int]] = None, - class_token: bool = False, -) -> mindspore.Tensor: - # Adapted with significant modifications from Swin / BeiT codebases - # get pair-wise relative position index for each token inside the window - assert k_size is None, 'Different q & k sizes not currently supported' # FIXME - - coords = ops.stack(ndgrid(ops.arange(q_size[0]), ops.arange(q_size[1]))).flatten(1) # 2, Wh, Ww - relative_coords = coords[:, :, None] - coords[:, None, :] # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.permute(1, 2, 0) # Qh*Qw, Kh*Kw, 2 - relative_coords[:, :, 0] += q_size[0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += q_size[1] - 1 - relative_coords[:, :, 0] *= 2 * q_size[1] - 1 - num_relative_distance = (2 * q_size[0] - 1) * (2 * q_size[1] - 1) - - # else: - # # FIXME different q vs k sizes is a WIP, need to better offset the two grids? 
- # q_coords = ops.stack( - # ndgrid( - # ops.arange(q_size[0]), - # ops.arange(q_size[1]) - # ) - # ).flatten(1) # 2, Wh, Ww - # k_coords = ops.stack( - # ndgrid( - # ops.arange(k_size[0]), - # ops.arange(k_size[1]) - # ) - # ).flatten(1) - # relative_coords = q_coords[:, :, None] - k_coords[:, None, :] # 2, Wh*Ww, Wh*Ww - # relative_coords = relative_coords.permute(1, 2, 0) # Qh*Qw, Kh*Kw, 2 - # relative_coords[:, :, 0] += max(q_size[0], k_size[0]) - 1 # shift to start from 0 - # relative_coords[:, :, 1] += max(q_size[1], k_size[1]) - 1 - # relative_coords[:, :, 0] *= k_size[1] + q_size[1] - 1 - # relative_position_index = relative_coords.sum(-1) # Qh*Qw, Kh*Kw - # num_relative_distance = (q_size[0] + k_size[0] - 1) * (q_size[1] + k_size[1] - 1) + 3 - - relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww - - if class_token: - # handle cls to token & token 2 cls & cls to cls as per beit for rel pos bias - # NOTE not intended or tested with MLP log-coords - relative_position_index = F.pad(relative_position_index, [1, 0, 1, 0]) - relative_position_index[0, 0:] = num_relative_distance - relative_position_index[0:, 0] = num_relative_distance + 1 - relative_position_index[0, 0] = num_relative_distance + 2 - - return relative_position_index.contiguous() - - -def resize_rel_pos_bias_table_simple( - rel_pos_bias, - new_window_size: Tuple[int, int], - new_bias_shape: Tuple[int, ...], -): - dst_size = (new_window_size[0] * 2 - 1, new_window_size[1] * 2 - 1) - if rel_pos_bias.ndim == 3: - # TF maxvit style (num_heads, H, W) bias shape, no extra tokens currently supported - _, dst_h, dst_w = new_bias_shape - num_attn_heads, src_h, src_w = rel_pos_bias.shape - assert dst_h == dst_size[0] and dst_w == dst_size[1] - if src_h != dst_h or src_w != dst_w: - rel_pos_bias = nn.functional.interpolate( - rel_pos_bias.unsqueeze(0), - size=dst_size, - mode="bicubic", - align_corners=False, - ).squeeze(0) - else: - assert rel_pos_bias.ndim == 2 - # (num_pos, num_heads) (aka flat) bias shape - dst_num_pos, _ = new_bias_shape - src_num_pos, num_attn_heads = rel_pos_bias.shape - num_extra_tokens = dst_num_pos - (dst_size[0] * dst_size[1]) - src_size = int((src_num_pos - num_extra_tokens) ** 0.5) - src_size = (src_size, src_size) # FIXME could support non-equal src if argument passed - - if src_size[0] != dst_size[0] or src_size[1] != dst_size[1]: - if num_extra_tokens: - extra_tokens = rel_pos_bias[-num_extra_tokens:, :] - rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :] - else: - extra_tokens = None - - rel_pos_bias = nn.functional.interpolate( - rel_pos_bias.transpose(1, 0).reshape((1, -1, src_size[0], src_size[1])), - size=dst_size, - mode="bicubic", - align_corners=False, - ).view(-1, dst_num_pos - num_extra_tokens).transpose(0, 1) - - if extra_tokens is not None: - rel_pos_bias = ops.cat((rel_pos_bias, extra_tokens), dim=0) - - return rel_pos_bias - - -def resize_rel_pos_bias_table_levit( - position_bias_table, - new_size, - interpolation: str = 'bicubic', - antialias: bool = True, -): - """ - Resample relative position bias table suggested in LeVit - Adapted from: https://github.com/microsoft/Cream/blob/main/TinyViT/utils.py - """ - L1, nH1 = position_bias_table.size() - L2, nH2 = new_size - assert nH1 == nH2 - if L1 != L2: - orig_dtype = position_bias_table.dtype - position_bias_table = position_bias_table.float() - # bicubic interpolate relative_position_bias_table if not match - S1 = int(L1 ** 0.5) - S2 = int(L2 ** 0.5) - relative_position_bias_table_resized = F.interpolate( - 
position_bias_table.permute(1, 0).view(1, nH1, S1, S1), - size=(S2, S2), - mode=interpolation, - antialias=antialias) - relative_position_bias_table_resized = \ - relative_position_bias_table_resized.view(nH2, L2).permute(1, 0) - relative_position_bias_table_resized.to(orig_dtype) - return relative_position_bias_table_resized - else: - return position_bias_table - - -def resize_rel_pos_bias_table( - rel_pos_bias, - new_window_size: Tuple[int, int], - new_bias_shape: Tuple[int, ...], -): - """ Resize relative position bias table using more advanced interpolation. - - Modified from code in Microsoft Unilm (https://github.com/microsoft/unilm) repo (BeiT, BeiT-v2, etc). - - https://github.com/microsoft/unilm/blob/5255d52de86dad642810f5849dd357769346c1d7/beit/run_class_finetuning.py#L351 - - Args: - rel_pos_bias: - new_window_size: - new_bias_shape: - - Returns: - - """ - if _USE_SCIPY: - from scipy import interpolate - - dst_size = (new_window_size[0] * 2 - 1, new_window_size[1] * 2 - 1) - if rel_pos_bias.ndim == 3: - # TF maxvit style (num_heads, H, W) bias shape, no extra tokens currently supported - num_extra_tokens = 0 - _, dst_h, dst_w = new_bias_shape - assert dst_h == dst_size[0] and dst_w == dst_size[1] - num_attn_heads, src_h, src_w = rel_pos_bias.shape - src_size = (src_h, src_w) - has_flat_shape = False - else: - assert rel_pos_bias.ndim == 2 - # (num_pos, num_heads) (aka flat) bias shape - dst_num_pos, _ = new_bias_shape - src_num_pos, num_attn_heads = rel_pos_bias.shape - num_extra_tokens = dst_num_pos - (dst_size[0] * dst_size[1]) - src_size = int((src_num_pos - num_extra_tokens) ** 0.5) - src_size = (src_size, src_size) - has_flat_shape = True - - if src_size[0] != dst_size[0] or src_size[1] != dst_size[1]: - # print("Interpolating position from %dx%d to %dx%d" % (src_size[0], src_size[1], dst_size[0], dst_size[1])) - if num_extra_tokens: - extra_tokens = rel_pos_bias[-num_extra_tokens:, :] - rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :] - else: - extra_tokens = None - - def geometric_progression(a, r, n): - return a * (1.0 - r ** n) / (1.0 - r) - - def _calc(src, dst): - left, right = 1.01, 1.5 - while right - left > 1e-6: - q = (left + right) / 2.0 - gp = geometric_progression(1, q, src // 2) - if gp > dst // 2: - right = q - else: - left = q - - dis = [] - cur = 1 - for i in range(src // 2): - dis.append(cur) - cur += q ** (i + 1) - r_ids = [-_ for _ in reversed(dis)] - return r_ids + [0] + dis - - y = _calc(src_size[0], dst_size[0]) - x = _calc(src_size[1], dst_size[1]) - yx = [mindspore.tensor(y), mindspore.tensor(x)] - # print("Original positions = %s" % str(x)) - - ty = dst_size[0] // 2.0 - tx = dst_size[1] // 2.0 - dy = ops.arange(-ty, ty + 0.1, 1.0) - dx = ops.arange(-tx, tx + 0.1, 1.0) - dyx = ndgrid(dy, dx) - # print("Target positions = %s" % str(dx)) - - all_rel_pos_bias = [] - for i in range(num_attn_heads): - if has_flat_shape: - z = rel_pos_bias[:, i].view(src_size[0], src_size[1]).float() - else: - z = rel_pos_bias[i, :, :].float() - - if _USE_SCIPY: - # Original beit code uses scipy w/ cubic interpolation - f = interpolate.interp2d(x, y, z.numpy(), kind='cubic') - r = mindspore.Tensor(f(dx, dy)).contiguous().to(rel_pos_bias.device) - else: - # Without scipy dependency, I've found a reasonably simple impl - # that supports uneven spaced interpolation pts with 'linear' interp. - # Results are comparable to scipy for model accuracy in most cases. 
- f = RegularGridInterpolator(yx, z) - r = f(dyx).contiguous().to(rel_pos_bias.device) - - if has_flat_shape: - r = r.view(-1, 1) - all_rel_pos_bias.append(r) - - if has_flat_shape: - rel_pos_bias = ops.cat(all_rel_pos_bias, dim=-1) - else: - rel_pos_bias = ops.cat(all_rel_pos_bias, dim=0) - - if extra_tokens is not None: - assert has_flat_shape - rel_pos_bias = ops.cat((rel_pos_bias, extra_tokens), dim=0) - - return rel_pos_bias - - -class RelPosBias(nn.Module): - """ Relative Position Bias - Adapted from Swin-V1 relative position bias impl, modularized. - """ - - def __init__(self, window_size, num_heads, prefix_tokens=0): - super().__init__() - assert prefix_tokens <= 1 - self.window_size = window_size - self.window_area = window_size[0] * window_size[1] - self.bias_shape = (self.window_area + prefix_tokens,) * 2 + (num_heads,) - - num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 * prefix_tokens - self.relative_position_bias_table = nn.Parameter(ops.zeros(num_relative_distance, num_heads)) - self.register_buffer( - "relative_position_index", - gen_relative_position_index(self.window_size, class_token=prefix_tokens > 0).view(-1), - persistent=False, - ) - - self.init_weights() - - def init_weights(self): - trunc_normal_(self.relative_position_bias_table, std=.02) - - def get_bias(self) -> mindspore.Tensor: - relative_position_bias = self.relative_position_bias_table[self.relative_position_index] - # win_h * win_w, win_h * win_w, num_heads - relative_position_bias = relative_position_bias.view(self.bias_shape).permute(2, 0, 1) - return relative_position_bias.unsqueeze(0).contiguous() - - def forward(self, attn, shared_rel_pos: Optional[mindspore.Tensor] = None): - return attn + self.get_bias() - - -def gen_relative_log_coords( - win_size: Tuple[int, int], - pretrained_win_size: Tuple[int, int] = (0, 0), - mode='swin', -): - assert mode in ('swin', 'cr') - # as per official swin-v2 impl, supporting timm specific 'cr' log coords as well - relative_coords_h = ops.arange(-(win_size[0] - 1), win_size[0]).to(mindspore.float32) - relative_coords_w = ops.arange(-(win_size[1] - 1), win_size[1]).to(mindspore.float32) - relative_coords_table = ops.stack(ndgrid(relative_coords_h, relative_coords_w)) - relative_coords_table = relative_coords_table.permute(1, 2, 0).contiguous() # 2*Wh-1, 2*Ww-1, 2 - if mode == 'swin': - if pretrained_win_size[0] > 0: - relative_coords_table[:, :, 0] /= (pretrained_win_size[0] - 1) - relative_coords_table[:, :, 1] /= (pretrained_win_size[1] - 1) - else: - relative_coords_table[:, :, 0] /= (win_size[0] - 1) - relative_coords_table[:, :, 1] /= (win_size[1] - 1) - relative_coords_table *= 8 # normalize to -8, 8 - relative_coords_table = ops.sign(relative_coords_table) * ops.log2( - 1.0 + relative_coords_table.abs()) / math.log2(8) - else: - # mode == 'cr' - relative_coords_table = ops.sign(relative_coords_table) * ops.log( - 1.0 + relative_coords_table.abs()) - - return relative_coords_table - - -class RelPosMlp(nn.Module): - """ Log-Coordinate Relative Position MLP - Based on ideas presented in Swin-V2 paper (https://arxiv.org/abs/2111.09883) - - This impl covers the 'swin' implementation as well as two timm specific modes ('cr', and 'rw') - """ - def __init__( - self, - window_size, - num_heads=8, - hidden_dim=128, - prefix_tokens=0, - mode='cr', - pretrained_window_size=(0, 0) - ): - super().__init__() - self.window_size = window_size - self.window_area = self.window_size[0] * self.window_size[1] - self.prefix_tokens = prefix_tokens - 
self.num_heads = num_heads - self.bias_shape = (self.window_area,) * 2 + (num_heads,) - if mode == 'swin': - self.bias_act = nn.Sigmoid() - self.bias_gain = 16 - mlp_bias = (True, False) - else: - self.bias_act = nn.Identity() - self.bias_gain = None - mlp_bias = True - - self.mlp = Mlp( - 2, # x, y - hidden_features=hidden_dim, - out_features=num_heads, - act_layer=nn.ReLU, - bias=mlp_bias, - drop=(0.125, 0.) - ) - - self.register_buffer( - "relative_position_index", - gen_relative_position_index(window_size).view(-1), - persistent=False) - - # get relative_coords_table - self.register_buffer( - "rel_coords_log", - gen_relative_log_coords(window_size, pretrained_window_size, mode=mode), - persistent=False) - - def get_bias(self) -> mindspore.Tensor: - relative_position_bias = self.mlp(self.rel_coords_log) - if self.relative_position_index is not None: - relative_position_bias = relative_position_bias.view(-1, self.num_heads)[self.relative_position_index] - relative_position_bias = relative_position_bias.view(self.bias_shape) - relative_position_bias = relative_position_bias.permute(2, 0, 1) - relative_position_bias = self.bias_act(relative_position_bias) - if self.bias_gain is not None: - relative_position_bias = self.bias_gain * relative_position_bias - if self.prefix_tokens: - relative_position_bias = F.pad(relative_position_bias, [self.prefix_tokens, 0, self.prefix_tokens, 0]) - return relative_position_bias.unsqueeze(0).contiguous() - - def forward(self, attn, shared_rel_pos: Optional[mindspore.Tensor] = None): - return attn + self.get_bias() - - -def generate_lookup_tensor( - length: int, - max_relative_position: Optional[int] = None, -): - """Generate a one_hot lookup tensor to reindex embeddings along one dimension. - - Args: - length: the length to reindex to. - max_relative_position: the maximum relative position to consider. - Relative position embeddings for distances above this threshold - are zeroed out. - Returns: - a lookup Tensor of size [length, length, vocab_size] that satisfies - ret[n,m,v] = 1{m - n + max_relative_position = v}. - """ - if max_relative_position is None: - max_relative_position = length - 1 - # Return the cached lookup tensor, otherwise compute it and cache it. - vocab_size = 2 * max_relative_position + 1 - ret = ops.zeros(length, length, vocab_size) - for i in range(length): - for x in range(length): - v = x - i + max_relative_position - if abs(x - i) > max_relative_position: - continue - ret[i, x, v] = 1 - return ret - - -def reindex_2d_einsum_lookup( - relative_position_tensor, - height: int, - width: int, - height_lookup: mindspore.Tensor, - width_lookup: mindspore.Tensor, -) -> mindspore.Tensor: - """Reindex 2d relative position bias with 2 independent einsum lookups. - - Adapted from: - https://github.com/google-research/maxvit/blob/2e06a7f1f70c76e64cd3dabe5cd1b8c1a23c9fb7/maxvit/models/attention_utils.py - - Args: - relative_position_tensor: tensor of shape - [..., vocab_height, vocab_width, ...]. - height: height to reindex to. - width: width to reindex to. - height_lookup: one-hot height lookup - width_lookup: one-hot width lookup - Returns: - reindexed_tensor: a Tensor of shape - [..., height * width, height * width, ...] 
- """ - reindexed_tensor = ops.einsum('nhw,ixh->nixw', relative_position_tensor, height_lookup) - reindexed_tensor = ops.einsum('nixw,jyw->nijxy', reindexed_tensor, width_lookup) - area = height * width - return reindexed_tensor.reshape(relative_position_tensor.shape[0], area, area) - - -class RelPosBiasTf(nn.Module): - """ Relative Position Bias Impl (Compatible with Tensorflow MaxViT models) - Adapted from: - https://github.com/google-research/maxvit/blob/2e06a7f1f70c76e64cd3dabe5cd1b8c1a23c9fb7/maxvit/models/attention_utils.py - """ - def __init__(self, window_size, num_heads, prefix_tokens=0): - super().__init__() - assert prefix_tokens <= 1 - self.window_size = window_size - self.window_area = window_size[0] * window_size[1] - self.num_heads = num_heads - - vocab_height = 2 * window_size[0] - 1 - vocab_width = 2 * window_size[1] - 1 - self.bias_shape = (self.num_heads, vocab_height, vocab_width) - self.relative_position_bias_table = nn.Parameter(ops.zeros(self.bias_shape)) - self.register_buffer('height_lookup', generate_lookup_tensor(window_size[0]), persistent=False) - self.register_buffer('width_lookup', generate_lookup_tensor(window_size[1]), persistent=False) - self.init_weights() - - def init_weights(self): - nn.init.normal_(self.relative_position_bias_table, std=.02) - - def get_bias(self) -> mindspore.Tensor: - # FIXME change to not use one-hot/einsum? - return reindex_2d_einsum_lookup( - self.relative_position_bias_table, - self.window_size[0], - self.window_size[1], - self.height_lookup, - self.width_lookup - ) - - def forward(self, attn, shared_rel_pos: Optional[mindspore.Tensor] = None): - return attn + self.get_bias() diff --git a/mindnlp/mimm/layers/pos_embed_sincos.py b/mindnlp/mimm/layers/pos_embed_sincos.py deleted file mode 100644 index 394adde85..000000000 --- a/mindnlp/mimm/layers/pos_embed_sincos.py +++ /dev/null @@ -1,428 +0,0 @@ -""" Sin-cos, fourier, rotary position embedding modules and functions - -Hacked together by / Copyright 2022 Ross Wightman -""" -import math -from typing import List, Optional - -import mindspore -from mindnlp.core import nn, ops - -from .grid import ndgrid - - -def pixel_freq_bands( - num_bands: int, - max_freq: float = 224., - linear_bands: bool = True, -): - if linear_bands: - bands = ops.linspace(1.0, max_freq / 2, num_bands, dtype=mindspore.float32) - else: - bands = 2 ** ops.linspace(0, math.log(max_freq, 2) - 1, num_bands, dtype=mindspore.float32) - return bands * math.pi - - -def freq_bands( - num_bands: int, - temperature: float = 10000., - step: int = 2, -) -> mindspore.Tensor: - exp = ops.arange(0, num_bands, step, dtype=mindspore.int64).to(mindspore.float32) / num_bands - bands = 1. 
/ (temperature ** exp) - return bands - - -def build_sincos2d_pos_embed( - feat_shape: List[int], - dim: int = 64, - temperature: float = 10000., - reverse_coord: bool = False, - interleave_sin_cos: bool = False, - dtype: mindspore.dtype = mindspore.float32, -) -> mindspore.Tensor: - """ - - Args: - feat_shape: - dim: - temperature: - reverse_coord: stack grid order W, H instead of H, W - interleave_sin_cos: sin, cos, sin, cos stack instead of sin, sin, cos, cos - dtype: - - Returns: - - """ - assert dim % 4 == 0, 'Embed dimension must be divisible by 4 for sin-cos 2D position embedding' - pos_dim = dim // 4 - bands = freq_bands(pos_dim, temperature=temperature, step=1) - - if reverse_coord: - feat_shape = feat_shape[::-1] # stack W, H instead of H, W - grid = ops.stack(ndgrid([ - ops.arange(s, dtype=mindspore.int64).to(mindspore.float32) - for s in feat_shape - ])).flatten(1).transpose(0, 1) - pos2 = grid.unsqueeze(-1) * bands.unsqueeze(0) - # FIXME add support for unflattened spatial dim? - - stack_dim = 2 if interleave_sin_cos else 1 # stack sin, cos, sin, cos instead of sin sin cos cos - pos_emb = ops.stack([ops.sin(pos2), ops.cos(pos2)], dim=stack_dim).flatten(1) - return pos_emb.to(dtype=dtype) - - -def build_fourier_pos_embed( - feat_shape: List[int], - bands: Optional[mindspore.Tensor] = None, - num_bands: int = 64, - max_res: int = 224, - temperature: float = 10000., - linear_bands: bool = False, - include_grid: bool = False, - in_pixels: bool = True, - ref_feat_shape: Optional[List[int]] = None, - dtype: mindspore.dtype = mindspore.float32, -) -> List[mindspore.Tensor]: - """ - - Args: - feat_shape: Feature shape for embedding. - bands: Pre-calculated frequency bands. - num_bands: Number of frequency bands (determines output dim). - max_res: Maximum resolution for pixel based freq. - temperature: Temperature for non-pixel freq. - linear_bands: Linear band spacing for pixel based freq. - include_grid: Include the spatial grid in output. - in_pixels: Output in pixel freq. - ref_feat_shape: Reference feature shape for resize / fine-tune. - dtype: Output dtype. 
- - Returns: - - """ - if bands is None: - if in_pixels: - bands = pixel_freq_bands( - num_bands, - float(max_res), - linear_bands=linear_bands, - ) - else: - bands = freq_bands( - num_bands, - temperature=temperature, - step=1, - ) - else: - if dtype is None: - dtype = bands.dtype - - if in_pixels: - t = [ops.linspace(-1., 1., steps=s, dtype=mindspore.float32) for s in feat_shape] - else: - t = [ops.arange(s, dtype=mindspore.int64).to(mindspore.float32) for s in feat_shape] - - if ref_feat_shape is not None: - # eva's scheme for resizing rope embeddings (ref shape = pretrain) - t = [x / f * r for x, f, r in zip(t, feat_shape, ref_feat_shape)] - - grid = ops.stack(ndgrid(t), dim=-1) - grid = grid.unsqueeze(-1) - pos = grid * bands - - pos_sin, pos_cos = pos.sin().to(dtype=dtype), pos.cos().to(dtype) - out = [grid, pos_sin, pos_cos] if include_grid else [pos_sin, pos_cos] - return out - - -class FourierEmbed(nn.Module): - - def __init__( - self, - max_res: int = 224, - num_bands: int = 64, - concat_grid=True, - keep_spatial=False, - ): - super().__init__() - self.max_res = max_res - self.num_bands = num_bands - self.concat_grid = concat_grid - self.keep_spatial = keep_spatial - self.register_buffer( - 'bands', - pixel_freq_bands(max_res, num_bands), - persistent=False, - ) - - def forward(self, x): - B, C = x.shape[:2] - feat_shape = x.shape[2:] - emb = build_fourier_pos_embed( - feat_shape, - self.bands, - include_grid=self.concat_grid, - dtype=x.dtype, - ) - emb = ops.cat(emb, dim=-1) - emb = emb.transpose(-1, -2).flatten(len(feat_shape)) - batch_expand = (B,) + (-1,) * (x.ndim - 1) - - # FIXME support nD - if self.keep_spatial: - x = ops.cat([x, emb.unsqueeze(0).expand(batch_expand).permute(0, 3, 1, 2)], dim=1) - else: - x = ops.cat([x.permute(0, 2, 3, 1), emb.unsqueeze(0).expand(batch_expand)], dim=-1) - x = x.reshape(B, feat_shape.numel(), -1) - - return x - - -def rot(x): - return ops.stack([-x[..., 1::2], x[..., ::2]], -1).reshape(x.shape) - - -def apply_rot_embed(x: mindspore.Tensor, sin_emb, cos_emb): - if sin_emb.ndim == 3: - return x * cos_emb.unsqueeze(1).expand_as(x) + rot(x) * sin_emb.unsqueeze(1).expand_as(x) - return x * cos_emb + rot(x) * sin_emb - - -def apply_rot_embed_list(x: List[mindspore.Tensor], sin_emb, cos_emb): - if isinstance(x, mindspore.Tensor): - x = [x] - return [t * cos_emb + rot(t) * sin_emb for t in x] - - -def apply_rot_embed_cat(x: mindspore.Tensor, emb): - sin_emb, cos_emb = emb.tensor_split(2, -1) - if sin_emb.ndim == 3: - return x * cos_emb.unsqueeze(1).expand_as(x) + rot(x) * sin_emb.unsqueeze(1).expand_as(x) - return x * cos_emb + rot(x) * sin_emb - - -def apply_keep_indices_nlc(x, pos_embed, keep_indices): - pos_embed = pos_embed.unsqueeze(0).expand(x.shape[0], -1, -1) - pos_embed = pos_embed.gather(1, keep_indices.unsqueeze(-1).expand(-1, -1, pos_embed.shape[-1])) - return pos_embed - - -def build_rotary_pos_embed( - feat_shape: List[int], - bands: Optional[mindspore.Tensor] = None, - dim: int = 64, - max_res: int = 224, - temperature: float = 10000., - linear_bands: bool = False, - in_pixels: bool = True, - ref_feat_shape: Optional[List[int]] = None, - dtype: mindspore.dtype = mindspore.float32, -): - """ - - Args: - feat_shape: Spatial shape of the target tensor for embedding. - bands: Optional pre-generated frequency bands - dim: Output dimension of embedding tensor. - max_res: Maximum resolution for pixel mode. 
- temperature: Temperature (inv freq) for non-pixel mode - linear_bands: Linearly (instead of log) spaced bands for pixel mode - in_pixels: Pixel vs language (inv freq) mode. - dtype: Output dtype. - - Returns: - - """ - sin_emb, cos_emb = build_fourier_pos_embed( - feat_shape, - bands=bands, - num_bands=dim // 4, - max_res=max_res, - temperature=temperature, - linear_bands=linear_bands, - in_pixels=in_pixels, - ref_feat_shape=ref_feat_shape, - dtype=dtype, - ) - num_spatial_dim = 1 - # this would be much nicer as a .numel() call to torch.Size(), but torchscript sucks - for x in feat_shape: - num_spatial_dim *= x - sin_emb = sin_emb.reshape(num_spatial_dim, -1).repeat_interleave(2, -1) - cos_emb = cos_emb.reshape(num_spatial_dim, -1).repeat_interleave(2, -1) - return sin_emb, cos_emb - - -class RotaryEmbedding(nn.Module): - """ Rotary position embedding - - NOTE: This is my initial attempt at impl rotary embedding for spatial use, it has not - been well tested, and will likely change. It will be moved to its own file. - - The following impl/resources were referenced for this impl: - * https://github.com/lucidrains/vit-pytorch/blob/6f3a5fcf0bca1c5ec33a35ef48d97213709df4ba/vit_pytorch/rvt.py - * https://blog.eleuther.ai/rotary-embeddings/ - """ - - def __init__( - self, - dim, - max_res=224, - temperature=10000, - in_pixels=True, - linear_bands: bool = False, - feat_shape: Optional[List[int]] = None, - ref_feat_shape: Optional[List[int]] = None, - ): - super().__init__() - self.dim = dim - self.max_res = max_res - self.temperature = temperature - self.in_pixels = in_pixels - self.feat_shape = feat_shape - self.ref_feat_shape = ref_feat_shape - - if feat_shape is None: - # only cache bands - if in_pixels: - bands = pixel_freq_bands( - dim // 4, - float(max_res), - linear_bands=linear_bands, - ) - else: - bands = freq_bands( - dim // 4, - temperature=temperature, - step=1, - ) - self.register_buffer( - 'bands', - bands, - persistent=False, - ) - self.pos_embed_sin = None - self.pos_embed_cos = None - else: - # cache full sin/cos embeddings if shape provided up front - emb_sin, emb_cos = build_rotary_pos_embed( - feat_shape=feat_shape, - dim=dim, - max_res=max_res, - linear_bands=linear_bands, - in_pixels=in_pixels, - ref_feat_shape=self.ref_feat_shape, - ) - self.bands = None - self.register_buffer( - 'pos_embed_sin', - emb_sin, - persistent=False, - ) - self.register_buffer( - 'pos_embed_cos', - emb_cos, - persistent=False, - ) - - def get_embed(self, shape: Optional[List[int]] = None): - if self.bands is not None: - # rebuild embeddings every call, use if target shape changes - assert shape is not None - return build_rotary_pos_embed( - shape, - self.bands, - in_pixels=self.in_pixels, - ) - else: - return self.pos_embed_sin, self.pos_embed_cos - - def forward(self, x): - # assuming channel-first tensor where spatial dim are >= 2 - sin_emb, cos_emb = self.get_embed(x.shape[2:]) - return apply_rot_embed(x, sin_emb, cos_emb) - - -class RotaryEmbeddingCat(nn.Module): - """ Rotary position embedding w/ concatenatd sin & cos - - The following impl/resources were referenced for this impl: - * https://github.com/lucidrains/vit-pytorch/blob/6f3a5fcf0bca1c5ec33a35ef48d97213709df4ba/vit_pytorch/rvt.py - * https://blog.eleuther.ai/rotary-embeddings/ - """ - - def __init__( - self, - dim, - max_res=224, - temperature=10000, - in_pixels=True, - linear_bands: bool = False, - feat_shape: Optional[List[int]] = None, - ref_feat_shape: Optional[List[int]] = None, - ): - super().__init__() - self.dim = dim - 
self.max_res = max_res - self.temperature = temperature - self.in_pixels = in_pixels - self.feat_shape = feat_shape - self.ref_feat_shape = ref_feat_shape - - if feat_shape is None: - # only cache bands - if in_pixels: - bands = pixel_freq_bands( - dim // 4, - float(max_res), - linear_bands=linear_bands, - ) - else: - bands = freq_bands( - dim // 4, - temperature=temperature, - step=1, - ) - self.register_buffer( - 'bands', - bands, - persistent=False, - ) - self.pos_embed = None - else: - # cache full sin/cos embeddings if shape provided up front - embeds = build_rotary_pos_embed( - feat_shape=feat_shape, - dim=dim, - max_res=max_res, - linear_bands=linear_bands, - in_pixels=in_pixels, - ref_feat_shape=self.ref_feat_shape, - ) - self.bands = None - self.register_buffer( - 'pos_embed', - ops.cat(embeds, -1), - persistent=False, - ) - - def get_embed(self, shape: Optional[List[int]] = None): - if self.bands is not None and shape is not None: - # rebuild embeddings every call, use if target shape changes - embeds = build_rotary_pos_embed( - shape, - self.bands, - in_pixels=self.in_pixels, - ref_feat_shape=self.ref_feat_shape, - ) - return ops.cat(embeds, -1) - elif self.pos_embed is not None: - return self.pos_embed - else: - assert False, "get_embed() requires pre-computed pos_embed or valid shape w/ pre-computed bands" - - def forward(self, x): - # assuming channel-first tensor where spatial dim are >= 2 - pos_embed = self.get_embed(x.shape[2:]) - return apply_rot_embed_cat(x, pos_embed) diff --git a/mindnlp/mimm/layers/selective_kernel.py b/mindnlp/mimm/layers/selective_kernel.py deleted file mode 100644 index 42b699b2b..000000000 --- a/mindnlp/mimm/layers/selective_kernel.py +++ /dev/null @@ -1,117 +0,0 @@ -""" Selective Kernel Convolution/Attention - -Paper: Selective Kernel Networks (https://arxiv.org/abs/1903.06586) - -Hacked together by / Copyright 2020 Ross Wightman -""" -from mindnlp.core import nn, ops - -from .conv_bn_act import ConvNormAct -from .helpers import make_divisible - - -def _kernel_valid(k): - if isinstance(k, (list, tuple)): - for ki in k: - return _kernel_valid(ki) - assert k >= 3 and k % 2 - - -class SelectiveKernelAttn(nn.Module): - def __init__(self, channels, num_paths=2, attn_channels=32, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d): - """ Selective Kernel Attention Module - - Selective Kernel attention mechanism factored out into its own module. - - """ - super(SelectiveKernelAttn, self).__init__() - self.num_paths = num_paths - self.fc_reduce = nn.Conv2d(channels, attn_channels, kernel_size=1, bias=False) - self.bn = norm_layer(attn_channels) - self.act = act_layer(inplace=True) - self.fc_select = nn.Conv2d(attn_channels, channels * num_paths, kernel_size=1, bias=False) - - def forward(self, x): - assert x.shape[1] == self.num_paths, '' - x = x.sum(1).mean((2, 3), keepdim=True) - x = self.fc_reduce(x) - x = self.bn(x) - x = self.act(x) - x = self.fc_select(x) - B, C, H, W = x.shape - x = x.view(B, self.num_paths, C // self.num_paths, H, W) - x = ops.softmax(x, dim=1) - return x - - -class SelectiveKernel(nn.Module): - - def __init__(self, in_channels, out_channels=None, kernel_size=None, stride=1, dilation=1, groups=1, - rd_ratio=1./16, rd_channels=None, rd_divisor=8, keep_3x3=True, split_input=True, - act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None, drop_layer=None): - """ Selective Kernel Convolution Module - - As described in Selective Kernel Networks (https://arxiv.org/abs/1903.06586) with some modifications. 
- - Largest change is the input split, which divides the input channels across each convolution path, this can - be viewed as a grouping of sorts, but the output channel counts expand to the module level value. This keeps - the parameter count from ballooning when the convolutions themselves don't have groups, but still provides - a noteworthy increase in performance over similar param count models without this attention layer. -Ross W - - Args: - in_channels (int): module input (feature) channel count - out_channels (int): module output (feature) channel count - kernel_size (int, list): kernel size for each convolution branch - stride (int): stride for convolutions - dilation (int): dilation for module as a whole, impacts dilation of each branch - groups (int): number of groups for each branch - rd_ratio (int, float): reduction factor for attention features - keep_3x3 (bool): keep all branch convolution kernels as 3x3, changing larger kernels for dilations - split_input (bool): split input channels evenly across each convolution branch, keeps param count lower, - can be viewed as grouping by path, output expands to module out_channels count - act_layer (nn.Module): activation layer to use - norm_layer (nn.Module): batchnorm/norm layer to use - aa_layer (nn.Module): anti-aliasing module - drop_layer (nn.Module): spatial drop module in convs (drop block, etc) - """ - super(SelectiveKernel, self).__init__() - out_channels = out_channels or in_channels - kernel_size = kernel_size or [3, 5] # default to one 3x3 and one 5x5 branch. 5x5 -> 3x3 + dilation - _kernel_valid(kernel_size) - if not isinstance(kernel_size, list): - kernel_size = [kernel_size] * 2 - if keep_3x3: - dilation = [dilation * (k - 1) // 2 for k in kernel_size] - kernel_size = [3] * len(kernel_size) - else: - dilation = [dilation] * len(kernel_size) - self.num_paths = len(kernel_size) - self.in_channels = in_channels - self.out_channels = out_channels - self.split_input = split_input - if self.split_input: - assert in_channels % self.num_paths == 0 - in_channels = in_channels // self.num_paths - groups = min(out_channels, groups) - - conv_kwargs = dict( - stride=stride, groups=groups, act_layer=act_layer, norm_layer=norm_layer, - aa_layer=aa_layer, drop_layer=drop_layer) - self.paths = nn.ModuleList([ - ConvNormAct(in_channels, out_channels, kernel_size=k, dilation=d, **conv_kwargs) - for k, d in zip(kernel_size, dilation)]) - - attn_channels = rd_channels or make_divisible(out_channels * rd_ratio, divisor=rd_divisor) - self.attn = SelectiveKernelAttn(out_channels, self.num_paths, attn_channels) - - def forward(self, x): - if self.split_input: - x_split = ops.split(x, self.in_channels // self.num_paths, 1) - x_paths = [op(x_split[i]) for i, op in enumerate(self.paths)] - else: - x_paths = [op(x) for op in self.paths] - x = ops.stack(x_paths, dim=1) - x_attn = self.attn(x) - x = x * x_attn - x = ops.sum(x, dim=1) - return x diff --git a/mindnlp/mimm/layers/separable_conv.py b/mindnlp/mimm/layers/separable_conv.py deleted file mode 100644 index 8c6337117..000000000 --- a/mindnlp/mimm/layers/separable_conv.py +++ /dev/null @@ -1,76 +0,0 @@ -""" Depthwise Separable Conv Modules - -Basic DWS convs. Other variations of DWS exist with batch norm or activations between the -DW and PW convs such as the Depthwise modules in MobileNetV2 / EfficientNet and Xception. 
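# --- Illustrative NumPy sketch (not part of the deleted selective-kernel module) ---
# The path weighting performed by SelectiveKernelAttn/SelectiveKernel above: per-path
# feature maps are summed and globally pooled, projected to one logit per
# (path, channel), softmaxed across the path axis, and used for a weighted sum of
# the paths. Random matrices stand in for the module's two 1x1 convs.
import numpy as np

def selective_kernel_attn(x_paths, attn_channels=32):
    # x_paths: (B, num_paths, C, H, W)
    B, P, C, H, W = x_paths.shape
    pooled = x_paths.sum(axis=1).mean(axis=(2, 3))                # (B, C) global descriptor
    w_reduce = 0.01 * np.random.randn(C, attn_channels)           # stand-in for fc_reduce
    w_select = 0.01 * np.random.randn(attn_channels, P * C)       # stand-in for fc_select
    logits = np.maximum(pooled @ w_reduce, 0.0) @ w_select        # ReLU bottleneck
    logits = logits.reshape(B, P, C, 1, 1)
    attn = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)   # softmax over paths
    return (x_paths * attn).sum(axis=1)                           # (B, C, H, W)

print(selective_kernel_attn(np.random.randn(2, 2, 16, 8, 8)).shape)   # (2, 16, 8, 8)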
- -Hacked together by / Copyright 2020 Ross Wightman -""" -from mindnlp.core import nn - -from .create_conv2d import create_conv2d -from .create_norm_act import get_norm_act_layer - - -class SeparableConvNormAct(nn.Module): - """ Separable Conv w/ trailing Norm and Activation - """ - def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dilation=1, padding='', bias=False, - channel_multiplier=1.0, pw_kernel_size=1, norm_layer=nn.BatchNorm2d, act_layer=nn.ReLU, - apply_act=True, drop_layer=None): - super(SeparableConvNormAct, self).__init__() - - self.conv_dw = create_conv2d( - in_channels, int(in_channels * channel_multiplier), kernel_size, - stride=stride, dilation=dilation, padding=padding, depthwise=True) - - self.conv_pw = create_conv2d( - int(in_channels * channel_multiplier), out_channels, pw_kernel_size, padding=padding, bias=bias) - - norm_act_layer = get_norm_act_layer(norm_layer, act_layer) - norm_kwargs = {"drop_layer": drop_layer} if drop_layer is not None else {} - self.bn = norm_act_layer(out_channels, apply_act=apply_act, **norm_kwargs) - - @property - def in_channels(self): - return self.conv_dw.in_channels - - @property - def out_channels(self): - return self.conv_pw.out_channels - - def forward(self, x): - x = self.conv_dw(x) - x = self.conv_pw(x) - x = self.bn(x) - return x - - -SeparableConvBnAct = SeparableConvNormAct - - -class SeparableConv2d(nn.Module): - """ Separable Conv - """ - def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dilation=1, padding='', bias=False, - channel_multiplier=1.0, pw_kernel_size=1): - super(SeparableConv2d, self).__init__() - - self.conv_dw = create_conv2d( - in_channels, int(in_channels * channel_multiplier), kernel_size, - stride=stride, dilation=dilation, padding=padding, depthwise=True) - - self.conv_pw = create_conv2d( - int(in_channels * channel_multiplier), out_channels, pw_kernel_size, padding=padding, bias=bias) - - @property - def in_channels(self): - return self.conv_dw.in_channels - - @property - def out_channels(self): - return self.conv_pw.out_channels - - def forward(self, x): - x = self.conv_dw(x) - x = self.conv_pw(x) - return x diff --git a/mindnlp/mimm/layers/space_to_depth.py b/mindnlp/mimm/layers/space_to_depth.py deleted file mode 100644 index e1c191825..000000000 --- a/mindnlp/mimm/layers/space_to_depth.py +++ /dev/null @@ -1,32 +0,0 @@ -"""space to depth""" -from mindnlp.core import nn - - -class SpaceToDepth(nn.Module): - bs: int - - def __init__(self, block_size=4): - super().__init__() - assert block_size == 4 - self.bs = block_size - - def forward(self, x): - N, C, H, W = x.size() - x = x.view(N, C, H // self.bs, self.bs, W // self.bs, self.bs) # (N, C, H//bs, bs, W//bs, bs) - x = x.permute(0, 3, 5, 1, 2, 4).contiguous() # (N, bs, bs, C, H//bs, W//bs) - x = x.view(N, C * self.bs * self.bs, H // self.bs, W // self.bs) # (N, C*bs^2, H//bs, W//bs) - return x - - -class DepthToSpace(nn.Module): - - def __init__(self, block_size): - super().__init__() - self.bs = block_size - - def forward(self, x): - N, C, H, W = x.size() - x = x.view(N, self.bs, self.bs, C // (self.bs ** 2), H, W) # (N, bs, bs, C//bs^2, H, W) - x = x.permute(0, 3, 4, 1, 5, 2).contiguous() # (N, C//bs^2, H, bs, W, bs) - x = x.view(N, C // (self.bs ** 2), H * self.bs, W * self.bs) # (N, C//bs^2, H * bs, W * bs) - return x diff --git a/mindnlp/mimm/layers/split_attn.py b/mindnlp/mimm/layers/split_attn.py deleted file mode 100644 index 211201017..000000000 --- a/mindnlp/mimm/layers/split_attn.py +++ /dev/null @@ 
-1,83 +0,0 @@ -""" Split Attention Conv2d (for ResNeSt Models) - -Paper: `ResNeSt: Split-Attention Networks` - /https://arxiv.org/abs/2004.08955 - -Adapted from original PyTorch impl at https://github.com/zhanghang1989/ResNeSt - -Modified for torchscript compat, performance, and consistency with timm by Ross Wightman -""" -from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F - -from .helpers import make_divisible - - -class RadixSoftmax(nn.Module): - def __init__(self, radix, cardinality): - super(RadixSoftmax, self).__init__() - self.radix = radix - self.cardinality = cardinality - - def forward(self, x): - batch = x.size(0) - if self.radix > 1: - x = x.view(batch, self.cardinality, self.radix, -1).transpose(1, 2) - x = F.softmax(x, dim=1) - x = x.reshape(batch, -1) - else: - x = ops.sigmoid(x) - return x - - -class SplitAttn(nn.Module): - """Split-Attention (aka Splat) - """ - def __init__(self, in_channels, out_channels=None, kernel_size=3, stride=1, padding=None, - dilation=1, groups=1, bias=False, radix=2, rd_ratio=0.25, rd_channels=None, rd_divisor=8, - act_layer=nn.ReLU, norm_layer=None, drop_layer=None, **kwargs): - super(SplitAttn, self).__init__() - out_channels = out_channels or in_channels - self.radix = radix - mid_chs = out_channels * radix - if rd_channels is None: - attn_chs = make_divisible(in_channels * radix * rd_ratio, min_value=32, divisor=rd_divisor) - else: - attn_chs = rd_channels * radix - - padding = kernel_size // 2 if padding is None else padding - self.conv = nn.Conv2d( - in_channels, mid_chs, kernel_size, stride, padding, dilation, - groups=groups * radix, bias=bias, **kwargs) - self.bn0 = norm_layer(mid_chs) if norm_layer else nn.Identity() - self.drop = drop_layer() if drop_layer is not None else nn.Identity() - self.act0 = act_layer(inplace=True) - self.fc1 = nn.Conv2d(out_channels, attn_chs, 1, groups=groups) - self.bn1 = norm_layer(attn_chs) if norm_layer else nn.Identity() - self.act1 = act_layer(inplace=True) - self.fc2 = nn.Conv2d(attn_chs, mid_chs, 1, groups=groups) - self.rsoftmax = RadixSoftmax(radix, groups) - - def forward(self, x): - x = self.conv(x) - x = self.bn0(x) - x = self.drop(x) - x = self.act0(x) - - B, RC, H, W = x.shape - if self.radix > 1: - x = x.reshape((B, self.radix, RC // self.radix, H, W)) - x_gap = x.sum(dim=1) - else: - x_gap = x - x_gap = x_gap.mean((2, 3), keepdim=True) - x_gap = self.fc1(x_gap) - x_gap = self.bn1(x_gap) - x_gap = self.act1(x_gap) - x_attn = self.fc2(x_gap) - - x_attn = self.rsoftmax(x_attn).view(B, -1, 1, 1) - if self.radix > 1: - out = (x * x_attn.reshape((B, self.radix, RC // self.radix, 1, 1))).sum(dim=1) - else: - out = x * x_attn - return out.contiguous() diff --git a/mindnlp/mimm/layers/split_batchnorm.py b/mindnlp/mimm/layers/split_batchnorm.py deleted file mode 100644 index dc5a15ccc..000000000 --- a/mindnlp/mimm/layers/split_batchnorm.py +++ /dev/null @@ -1,74 +0,0 @@ -""" Split BatchNorm - -A PyTorch BatchNorm layer that splits input batch into N equal parts and passes each through -a separate BN layer. The first split is passed through the parent BN layers with weight/bias -keys the same as the original BN. All other splits pass through BN sub-layers under the '.aux_bn' -namespace. 
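# --- Illustrative NumPy sketch (not part of the deleted split-attention module) ---
# The reshuffle done by RadixSoftmax above: logits of shape (B, radix*cardinality*c)
# are regrouped so the softmax runs across the radix splits of each (group, channel)
# slot, then flattened back; with radix == 1 a plain sigmoid gate is used instead.
import numpy as np

def radix_softmax(x, radix, cardinality):
    B = x.shape[0]
    if radix > 1:
        x = x.reshape(B, cardinality, radix, -1).transpose(0, 2, 1, 3)  # (B, radix, card, c)
        e = np.exp(x - x.max(axis=1, keepdims=True))
        return (e / e.sum(axis=1, keepdims=True)).reshape(B, -1)        # softmax over radix
    return 1.0 / (1.0 + np.exp(-x))                                     # sigmoid gate

print(radix_softmax(np.random.randn(2, 2 * 1 * 8), radix=2, cardinality=1).shape)   # (2, 16)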
- -This allows easily removing the auxiliary BN layers after training to efficiently -achieve the 'Auxiliary BatchNorm' as described in the AdvProp Paper, section 4.2, -'Disentangled Learning via An Auxiliary BN' - -Hacked together by / Copyright 2020 Ross Wightman -""" -import mindspore -from mindnlp.core import nn, ops - -class SplitBatchNorm2d(nn.BatchNorm2d): - - def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, - track_running_stats=True, num_splits=2): - super().__init__(num_features, eps, momentum, affine, track_running_stats) - assert num_splits > 1, 'Should have at least one aux BN layer (num_splits at least 2)' - self.num_splits = num_splits - self.aux_bn = nn.ModuleList([ - nn.BatchNorm2d(num_features, eps, momentum, affine, track_running_stats) for _ in range(num_splits - 1)]) - - def forward(self, input: mindspore.Tensor): - if self.training: # aux BN only relevant while training - split_size = input.shape[0] // self.num_splits - assert input.shape[0] == split_size * self.num_splits, "batch size must be evenly divisible by num_splits" - split_input = input.split(split_size) - x = [super().forward(split_input[0])] - for i, a in enumerate(self.aux_bn): - x.append(a(split_input[i + 1])) - return ops.cat(x, dim=0) - else: - return super().forward(input) - - -def convert_splitbn_model(module, num_splits=2): - """ - Recursively traverse module and its children to replace all instances of - ``torch.nn.modules.batchnorm._BatchNorm`` with `SplitBatchnorm2d`. - Args: - module (torch.nn.Module): input module - num_splits: number of separate batchnorm layers to split input across - Example:: - >>> # model is an instance of torch.nn.Module - >>> model = timm.models.convert_splitbn_model(model, num_splits=2) - """ - mod = module - if isinstance(module, nn.modules.instancenorm._InstanceNorm): - return module - if isinstance(module, nn.modules.batchnorm._BatchNorm): - mod = SplitBatchNorm2d( - module.num_features, module.eps, module.momentum, module.affine, - module.track_running_stats, num_splits=num_splits) - mod.running_mean = module.running_mean - mod.running_var = module.running_var - mod.num_batches_tracked = module.num_batches_tracked - if module.affine: - mod.weight.data = module.weight.data.clone().detach() - mod.bias.data = module.bias.data.clone().detach() - for aux in mod.aux_bn: - aux.running_mean = module.running_mean.clone() - aux.running_var = module.running_var.clone() - aux.num_batches_tracked = module.num_batches_tracked.clone() - if module.affine: - aux.weight.data = module.weight.data.clone().detach() - aux.bias.data = module.bias.data.clone().detach() - for name, child in module.named_children(): - mod.add_module(name, convert_splitbn_model(child, num_splits=num_splits)) - del module - return mod diff --git a/mindnlp/mimm/layers/squeeze_excite.py b/mindnlp/mimm/layers/squeeze_excite.py deleted file mode 100644 index 28e940427..000000000 --- a/mindnlp/mimm/layers/squeeze_excite.py +++ /dev/null @@ -1,102 +0,0 @@ -""" Squeeze-and-Excitation Channel Attention - -An SE implementation originally based on PyTorch SE-Net impl. -Has since evolved with additional functionality / configuration. - -Paper: `Squeeze-and-Excitation Networks` - https://arxiv.org/abs/1709.01507 - -Also included is Effective Squeeze-Excitation (ESE). 
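# --- Illustrative NumPy sketch (not part of the deleted split-batchnorm module) ---
# The batch routing in SplitBatchNorm2d above: during training the batch is cut into
# num_splits equal chunks, the first chunk goes through the "main" BN statistics and
# every other chunk through its own auxiliary BN, then the chunks are concatenated
# back in order. Plain per-chunk standardization stands in for real BatchNorm layers.
import numpy as np

def split_batchnorm_train(x, num_splits=2, eps=1e-5):
    # x: (N, C, H, W); N must be evenly divisible by num_splits
    chunks = np.split(x, num_splits, axis=0)
    outs = []
    for chunk in chunks:        # chunk 0 ~ main BN, the rest ~ aux BN layers
        mean = chunk.mean(axis=(0, 2, 3), keepdims=True)
        var = chunk.var(axis=(0, 2, 3), keepdims=True)
        outs.append((chunk - mean) / np.sqrt(var + eps))
    return np.concatenate(outs, axis=0)

print(split_batchnorm_train(np.random.randn(8, 4, 3, 3)).shape)   # (8, 4, 3, 3)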
-Paper: `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667 - -Hacked together by / Copyright 2021 Ross Wightman -""" -from mindnlp.core import nn - -from .create_act import create_act_layer -from .helpers import make_divisible - - -class SEModule(nn.Module): - """ SE Module as defined in original SE-Nets with a few additions - Additions include: - * divisor can be specified to keep channels % div == 0 (default: 8) - * reduction channels can be specified directly by arg (if rd_channels is set) - * reduction channels can be specified by float rd_ratio (default: 1/16) - * global max pooling can be added to the squeeze aggregation - * customizable activation, normalization, and gate layer - """ - def __init__( - self, channels, rd_ratio=1. / 16, rd_channels=None, rd_divisor=8, add_maxpool=False, - bias=True, act_layer=nn.ReLU, norm_layer=None, gate_layer='sigmoid'): - super(SEModule, self).__init__() - self.add_maxpool = add_maxpool - if not rd_channels: - rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.) - self.fc1 = nn.Conv2d(channels, rd_channels, kernel_size=1, bias=bias) - self.bn = norm_layer(rd_channels) if norm_layer else nn.Identity() - self.act = create_act_layer(act_layer, inplace=True) - self.fc2 = nn.Conv2d(rd_channels, channels, kernel_size=1, bias=bias) - self.gate = create_act_layer(gate_layer) - - def forward(self, x): - x_se = x.mean((2, 3), keepdim=True) - if self.add_maxpool: - # experimental codepath, may remove or change - x_se = 0.5 * x_se + 0.5 * x.amax((2, 3), keepdim=True) - x_se = self.fc1(x_se) - x_se = self.act(self.bn(x_se)) - x_se = self.fc2(x_se) - return x * self.gate(x_se) - - -SqueezeExcite = SEModule # alias - - -class EffectiveSEModule(nn.Module): - """ 'Effective Squeeze-Excitation - From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667 - """ - def __init__(self, channels, add_maxpool=False, gate_layer='hard_sigmoid', **_): - super(EffectiveSEModule, self).__init__() - self.add_maxpool = add_maxpool - self.fc = nn.Conv2d(channels, channels, kernel_size=1, padding=0) - self.gate = create_act_layer(gate_layer) - - def forward(self, x): - x_se = x.mean((2, 3), keepdim=True) - if self.add_maxpool: - # experimental codepath, may remove or change - x_se = 0.5 * x_se + 0.5 * x.amax((2, 3), keepdim=True) - x_se = self.fc(x_se) - return x * self.gate(x_se) - - -EffectiveSqueezeExcite = EffectiveSEModule # alias - - -class SqueezeExciteCl(nn.Module): - """ SE Module as defined in original SE-Nets with a few additions - Additions include: - * divisor can be specified to keep channels % div == 0 (default: 8) - * reduction channels can be specified directly by arg (if rd_channels is set) - * reduction channels can be specified by float rd_ratio (default: 1/16) - * global max pooling can be added to the squeeze aggregation - * customizable activation, normalization, and gate layer - """ - def __init__( - self, channels, rd_ratio=1. / 16, rd_channels=None, rd_divisor=8, - bias=True, act_layer=nn.ReLU, gate_layer='sigmoid'): - super().__init__() - if not rd_channels: - rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.) 
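# --- Illustrative NumPy sketch (not part of the deleted squeeze-excite module) ---
# The gating performed by SEModule above: global average pooling squeezes each
# channel to one value, a small bottleneck (random 1x1 weights stand in for
# fc1/fc2 here) predicts per-channel gates, and the input is rescaled channel-wise
# by the sigmoid of those gates.
import numpy as np

def se_gate(x, rd_channels=4):
    B, C, H, W = x.shape
    w1 = 0.01 * np.random.randn(C, rd_channels)     # stand-in for fc1 (1x1 conv)
    w2 = 0.01 * np.random.randn(rd_channels, C)     # stand-in for fc2 (1x1 conv)
    squeeze = x.mean(axis=(2, 3))                   # (B, C)
    excite = np.maximum(squeeze @ w1, 0.0) @ w2     # ReLU bottleneck
    gate = 1.0 / (1.0 + np.exp(-excite))            # sigmoid, (B, C)
    return x * gate[:, :, None, None]

print(se_gate(np.random.randn(2, 16, 8, 8)).shape)   # (2, 16, 8, 8)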
- self.fc1 = nn.Linear(channels, rd_channels, bias=bias) - self.act = create_act_layer(act_layer, inplace=True) - self.fc2 = nn.Linear(rd_channels, channels, bias=bias) - self.gate = create_act_layer(gate_layer) - - def forward(self, x): - x_se = x.mean((1, 2), keepdim=True) - x_se = self.fc1(x_se) - x_se = self.act(x_se) - x_se = self.fc2(x_se) - return x * self.gate(x_se) diff --git a/mindnlp/mimm/layers/std_conv.py b/mindnlp/mimm/layers/std_conv.py deleted file mode 100644 index 5dc6f7ca3..000000000 --- a/mindnlp/mimm/layers/std_conv.py +++ /dev/null @@ -1,132 +0,0 @@ -""" Convolution with Weight Standardization (StdConv and ScaledStdConv) - -StdConv: -@article{weightstandardization, - author = {Siyuan Qiao and Huiyu Wang and Chenxi Liu and Wei Shen and Alan Yuille}, - title = {Weight Standardization}, - journal = {arXiv preprint arXiv:1903.10520}, - year = {2019}, -} -Code: https://github.com/joe-siyuan-qiao/WeightStandardization - -ScaledStdConv: -Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets` - - https://arxiv.org/abs/2101.08692 -Official Deepmind JAX code: https://github.com/deepmind/deepmind-research/tree/master/nfnets - -Hacked together by / copyright Ross Wightman, 2021. -""" -from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F - -from .padding import get_padding, get_padding_value, pad_same - - -class StdConv2d(nn.Conv2d): - """Conv2d with Weight Standardization. Used for BiT ResNet-V2 models. - - Paper: `Micro-Batch Training with Batch-Channel Normalization and Weight Standardization` - - https://arxiv.org/abs/1903.10520v2 - """ - def __init__( - self, in_channel, out_channels, kernel_size, stride=1, padding=None, - dilation=1, groups=1, bias=False, eps=1e-6): - if padding is None: - padding = get_padding(kernel_size, stride, dilation) - super().__init__( - in_channel, out_channels, kernel_size, stride=stride, - padding=padding, dilation=dilation, groups=groups, bias=bias) - self.eps = eps - - def forward(self, x): - weight = F.batch_norm( - self.weight.reshape(1, self.out_channels, -1), None, None, - training=True, momentum=0., eps=self.eps).reshape_as(self.weight) - x = F.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups) - return x - - -class StdConv2dSame(nn.Conv2d): - """Conv2d with Weight Standardization. TF compatible SAME padding. Used for ViT Hybrid model. - - Paper: `Micro-Batch Training with Batch-Channel Normalization and Weight Standardization` - - https://arxiv.org/abs/1903.10520v2 - """ - def __init__( - self, in_channel, out_channels, kernel_size, stride=1, padding='SAME', - dilation=1, groups=1, bias=False, eps=1e-6): - padding, is_dynamic = get_padding_value(padding, kernel_size, stride=stride, dilation=dilation) - super().__init__( - in_channel, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, - groups=groups, bias=bias) - self.same_pad = is_dynamic - self.eps = eps - - def forward(self, x): - if self.same_pad: - x = pad_same(x, self.kernel_size, self.stride, self.dilation) - weight = F.batch_norm( - self.weight.reshape(1, self.out_channels, -1), None, None, - training=True, momentum=0., eps=self.eps).reshape_as(self.weight) - x = F.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups) - return x - - -class ScaledStdConv2d(nn.Conv2d): - """Conv2d layer with Scaled Weight Standardization. 
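# --- Illustrative NumPy sketch (not part of the deleted std_conv module) ---
# The weight transform behind StdConv2d above: each output filter is standardized to
# zero mean / unit variance over its fan-in (in_channels * kh * kw) before the
# convolution runs. ScaledStdConv2d additionally applies a learnable per-filter gain
# times gamma / sqrt(fan_in).
import numpy as np

def standardize_weight(weight, eps=1e-6, gain=None, gamma=1.0):
    # weight: (out_channels, in_channels, kh, kw)
    flat = weight.reshape(weight.shape[0], -1)
    mean = flat.mean(axis=1, keepdims=True)
    var = flat.var(axis=1, keepdims=True)
    std_w = (flat - mean) / np.sqrt(var + eps)
    if gain is not None:                            # ScaledStdConv2d-style scaling
        std_w = std_w * (gain.reshape(-1, 1) * gamma * flat.shape[1] ** -0.5)
    return std_w.reshape(weight.shape)

w = standardize_weight(np.random.randn(8, 3, 3, 3))
print(w.std(axis=(1, 2, 3)))                        # ~1.0 for every output filter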
- - Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets` - - https://arxiv.org/abs/2101.08692 - - NOTE: the operations used in this impl differ slightly from the DeepMind Haiku impl. The impact is minor. - """ - - def __init__( - self, in_channels, out_channels, kernel_size, stride=1, padding=None, - dilation=1, groups=1, bias=True, gamma=1.0, eps=1e-6, gain_init=1.0): - if padding is None: - padding = get_padding(kernel_size, stride, dilation) - super().__init__( - in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, - groups=groups, bias=bias) - self.gain = nn.Parameter(ops.full((self.out_channels, 1, 1, 1), gain_init)) - self.scale = gamma * self.weight[0].numel() ** -0.5 # gamma * 1 / sqrt(fan-in) - self.eps = eps - - def forward(self, x): - weight = F.batch_norm( - self.weight.reshape(1, self.out_channels, -1), None, None, - weight=(self.gain * self.scale).view(-1), - training=True, momentum=0., eps=self.eps).reshape_as(self.weight) - return F.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups) - - -class ScaledStdConv2dSame(nn.Conv2d): - """Conv2d layer with Scaled Weight Standardization and Tensorflow-like SAME padding support - - Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets` - - https://arxiv.org/abs/2101.08692 - - NOTE: the operations used in this impl differ slightly from the DeepMind Haiku impl. The impact is minor. - """ - - def __init__( - self, in_channels, out_channels, kernel_size, stride=1, padding='SAME', - dilation=1, groups=1, bias=True, gamma=1.0, eps=1e-6, gain_init=1.0): - padding, is_dynamic = get_padding_value(padding, kernel_size, stride=stride, dilation=dilation) - super().__init__( - in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, - groups=groups, bias=bias) - self.gain = nn.Parameter(ops.full((self.out_channels, 1, 1, 1), gain_init)) - self.scale = gamma * self.weight[0].numel() ** -0.5 - self.same_pad = is_dynamic - self.eps = eps - - def forward(self, x): - if self.same_pad: - x = pad_same(x, self.kernel_size, self.stride, self.dilation) - weight = F.batch_norm( - self.weight.reshape(1, self.out_channels, -1), None, None, - weight=(self.gain * self.scale).view(-1), - training=True, momentum=0., eps=self.eps).reshape_as(self.weight) - return F.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups) diff --git a/mindnlp/mimm/layers/test_time_pool.py b/mindnlp/mimm/layers/test_time_pool.py deleted file mode 100644 index 9a80bb6cd..000000000 --- a/mindnlp/mimm/layers/test_time_pool.py +++ /dev/null @@ -1,51 +0,0 @@ -""" Test Time Pooling (Average-Max Pool) - -Hacked together by / Copyright 2020 Ross Wightman -""" - -import logging -from mindnlp.core import nn -from mindnlp.core.nn import functional as F - -from .adaptive_avgmax_pool import adaptive_avgmax_pool2d - - -_logger = logging.getLogger(__name__) - - -class TestTimePoolHead(nn.Module): - def __init__(self, base, original_pool=7): - super(TestTimePoolHead, self).__init__() - self.base = base - self.original_pool = original_pool - base_fc = self.base.get_classifier() - if isinstance(base_fc, nn.Conv2d): - self.fc = base_fc - else: - self.fc = nn.Conv2d( - self.base.num_features, self.base.num_classes, kernel_size=1, bias=True) - self.fc.weight.data.copy_(base_fc.weight.data.view(self.fc.weight.size())) - self.fc.bias.data.copy_(base_fc.bias.data.view(self.fc.bias.size())) - 
self.base.reset_classifier(0) # delete original fc layer - - def forward(self, x): - x = self.base.forward_features(x) - x = F.avg_pool2d(x, kernel_size=self.original_pool, stride=1) - x = self.fc(x) - x = adaptive_avgmax_pool2d(x, 1) - return x.view(x.size(0), -1) - - -def apply_test_time_pool(model, config, use_test_size=False): - test_time_pool = False - if not hasattr(model, 'default_cfg') or not model.default_cfg: - return model, False - if use_test_size and 'test_input_size' in model.default_cfg: - df_input_size = model.default_cfg['test_input_size'] - else: - df_input_size = model.default_cfg['input_size'] - if config['input_size'][-1] > df_input_size[-1] and config['input_size'][-2] > df_input_size[-2]: - _logger.info(f"Target input size {str(config['input_size'][-2:])} > pretrained default {str(df_input_size[-2:])}, using test time pooling") - model = TestTimePoolHead(model, original_pool=model.default_cfg['pool_size']) - test_time_pool = True - return model, test_time_pool diff --git a/mindnlp/mimm/layers/typing.py b/mindnlp/mimm/layers/typing.py deleted file mode 100644 index 7b28cacbb..000000000 --- a/mindnlp/mimm/layers/typing.py +++ /dev/null @@ -1,8 +0,0 @@ -"""typing""" -from typing import Callable, Tuple, Type, Union - -from mindnlp.core.nn import Module - - -LayerType = Union[str, Callable, Type[Module]] -PadType = Union[str, int, Tuple[int, int]] diff --git a/mindnlp/mimm/layers/weight_init.py b/mindnlp/mimm/layers/weight_init.py deleted file mode 100644 index 4aff1fcc9..000000000 --- a/mindnlp/mimm/layers/weight_init.py +++ /dev/null @@ -1,169 +0,0 @@ -"""weight init""" -import math -import warnings -from mindnlp.core import nn, no_grad -from mindnlp.core.nn.init import _calculate_fan_in_and_fan_out - - -def _trunc_normal_(tensor, mean, std, a, b): - # Cut & paste from PyTorch official master until it's in a few official releases - RW - # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf - def norm_cdf(x): - # Computes standard normal cumulative distribution function - return (1. + math.erf(x / math.sqrt(2.))) / 2. - - if (mean < a - 2 * std) or (mean > b + 2 * std): - warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " - "The distribution of values may be incorrect.", - stacklevel=2) - - # Values are generated by using a truncated uniform distribution and - # then using the inverse CDF for the normal distribution. - # Get upper and lower cdf values - l = norm_cdf((a - mean) / std) - u = norm_cdf((b - mean) / std) - - # Uniformly fill tensor with values from [l, u], then translate to - # [2l-1, 2u-1]. - tensor.uniform_(2 * l - 1, 2 * u - 1) - - # Use inverse cdf transform for normal distribution to get truncated - # standard normal - tensor.erfinv_() - - # Transform to proper mean, std - tensor.mul_(std * math.sqrt(2.)) - tensor.add_(mean) - - # Clamp to ensure it's in the proper range - tensor.clamp_(min=a, max=b) - return tensor - - -def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): - # type: (Tensor, float, float, float, float) -> Tensor - r"""Fills the input Tensor with values drawn from a truncated - normal distribution. The values are effectively drawn from the - normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` - with values outside :math:`[a, b]` redrawn until they are within - the bounds. The method used for generating the random values works - best when :math:`a \leq \text{mean} \leq b`. 
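# --- Illustrative NumPy sketch (not part of the deleted test_time_pool module) ---
# The idea behind TestTimePoolHead above: when evaluating at a larger input size than
# training, the classifier is applied convolutionally over the larger feature map and
# the resulting per-class maps are reduced with a 0.5 * (avg + max) pool rather than
# a single global pool (the behaviour assumed here for adaptive_avgmax_pool2d).
import numpy as np

def test_time_pool(class_maps):
    # class_maps: (B, num_classes, H, W), classifier already applied as a 1x1 conv
    avg = class_maps.mean(axis=(2, 3))
    mx = class_maps.max(axis=(2, 3))
    return 0.5 * avg + 0.5 * mx                     # avg-max pooled logits, (B, num_classes)

print(test_time_pool(np.random.randn(2, 10, 3, 3)).shape)   # (2, 10)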
- - NOTE: this impl is similar to the PyTorch trunc_normal_, the bounds [a, b] are - applied while sampling the normal with mean/std applied, therefore a, b args - should be adjusted to match the range of mean, std args. - - Args: - tensor: an n-dimensional `torch.Tensor` - mean: the mean of the normal distribution - std: the standard deviation of the normal distribution - a: the minimum cutoff value - b: the maximum cutoff value - Examples: - >>> w = torch.empty(3, 5) - >>> nn.init.trunc_normal_(w) - """ - with no_grad(): - return _trunc_normal_(tensor, mean, std, a, b) - - -def trunc_normal_tf_(tensor, mean=0., std=1., a=-2., b=2.): - # type: (Tensor, float, float, float, float) -> Tensor - r"""Fills the input Tensor with values drawn from a truncated - normal distribution. The values are effectively drawn from the - normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` - with values outside :math:`[a, b]` redrawn until they are within - the bounds. The method used for generating the random values works - best when :math:`a \leq \text{mean} \leq b`. - - NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the - bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0 - and the result is subsquently scaled and shifted by the mean and std args. - - Args: - tensor: an n-dimensional `torch.Tensor` - mean: the mean of the normal distribution - std: the standard deviation of the normal distribution - a: the minimum cutoff value - b: the maximum cutoff value - Examples: - >>> w = torch.empty(3, 5) - >>> nn.init.trunc_normal_(w) - """ - with no_grad(): - _trunc_normal_(tensor, 0, 1.0, a, b) - tensor.mul_(std).add_(mean) - return tensor - - -def variance_scaling_(tensor, scale=1.0, mode='fan_in', distribution='normal'): - fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) - if mode == 'fan_in': - denom = fan_in - elif mode == 'fan_out': - denom = fan_out - elif mode == 'fan_avg': - denom = (fan_in + fan_out) / 2 - - variance = scale / denom - - if distribution == "truncated_normal": - # constant is stddev of standard normal truncated to (-2, 2) - trunc_normal_tf_(tensor, std=math.sqrt(variance) / .87962566103423978) - elif distribution == "normal": - with no_grad(): - tensor.normal_(std=math.sqrt(variance)) - elif distribution == "uniform": - bound = math.sqrt(3 * variance) - with no_grad(): - tensor.uniform_(-bound, bound) - else: - raise ValueError(f"invalid distribution {distribution}") - - -def lecun_normal_(tensor): - variance_scaling_(tensor, mode='fan_in', distribution='truncated_normal') - - -def init_weight_vit( - module: nn.Module, - name: str, - init_bias: float = 0.02, - head_bias: float = 0., - classifier_name: str = 'head' -): - if isinstance(module, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): - if name.startswith(classifier_name): - nn.init.zeros_(module.weight) - nn.init.constant_(module.bias, head_bias) - else: - nn.init.trunc_normal_(module.weight, std=0.02) - if isinstance(module, nn.Linear) and module.bias is not None: - nn.init.constant_(module.bias, init_bias) - elif hasattr(module, 'init_weights'): - module.init_weights() - - -def init_weight_jax( - module: nn.Module, - name: str, - head_bias: float = 0., - classifier_name: str = 'head', -): - if isinstance(module, nn.Linear): - if name.startswith(classifier_name): - nn.init.zeros_(module.weight) - nn.init.constant_(module.bias, head_bias) - else: - nn.init.xavier_uniform_(module.weight) - if module.bias is not None: - if 'mlp' in name: - 
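# --- Illustrative NumPy sketch (not part of the deleted weight_init module) ---
# The two-step scheme used by trunc_normal_tf_ / variance_scaling_ above: sample a
# standard normal truncated to [a, b], then scale by std and shift by mean (the "tf"
# ordering). Simple rejection sampling stands in for the inverse-CDF trick used in
# _trunc_normal_.
import math
import numpy as np

def trunc_standard_normal(shape, a=-2.0, b=2.0):
    out = np.random.randn(*shape)
    bad = (out < a) | (out > b)
    while bad.any():                                # redraw out-of-bounds values
        out[bad] = np.random.randn(int(bad.sum()))
        bad = (out < a) | (out > b)
    return out

def trunc_normal_tf(shape, mean=0.0, std=1.0, a=-2.0, b=2.0):
    return trunc_standard_normal(shape, a, b) * std + mean

# variance_scaling_ with distribution='truncated_normal' divides the target std by
# 0.8796..., the std of a standard normal truncated to (-2, 2), to undo the shrinkage.
fan_in = 64
w = trunc_normal_tf((256, fan_in), std=math.sqrt(1.0 / fan_in) / 0.87962566103423978)
print(round(float(w.std()), 3))                     # close to sqrt(1 / 64) ~= 0.125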
nn.init.normal_(module.bias, std=1e-6) - else: - nn.init.zeros_(module.bias) - elif isinstance(module, nn.Conv2d): - lecun_normal_(module.weight) - if module.bias is not None: - nn.init.zeros_(module.bias) - elif hasattr(module, 'init_weights'): - module.init_weights() diff --git a/mindnlp/mimm/models/__init__.py b/mindnlp/mimm/models/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mindnlp/mimm/models/_builder.py b/mindnlp/mimm/models/_builder.py deleted file mode 100644 index 12aedcb73..000000000 --- a/mindnlp/mimm/models/_builder.py +++ /dev/null @@ -1,470 +0,0 @@ -# import dataclasses -# import logging -# import os -# from copy import deepcopy -# from typing import Any, Callable, Dict, List, Optional, Tuple - -# from mindnlp.core import nn -# from mindnlp.core.hub import load_state_dict_from_url - -# from mindnlp.mimm.models._features import FeatureListNet, FeatureDictNet, FeatureHookNet, FeatureGetterNet -# from mindnlp.mimm.models._features_fx import FeatureGraphNet -# from mindnlp.mimm.models._helpers import load_state_dict -# from mindnlp.mimm.models._hub import has_hf_hub, download_cached_file, check_cached_file, load_state_dict_from_hf,\ -# load_custom_from_hf -# from mindnlp.mimm.models._manipulate import adapt_input_conv -# from mindnlp.mimm.models._pretrained import PretrainedCfg -# from mindnlp.mimm.models._prune import adapt_model_from_file -# from mindnlp.mimm.models._registry import get_pretrained_cfg - -# _logger = logging.getLogger(__name__) - -# # Global variables for rarely used pretrained checkpoint download progress and hash check. -# # Use set_pretrained_download_progress / set_pretrained_check_hash functions to toggle. -# _DOWNLOAD_PROGRESS = False -# _CHECK_HASH = False -# _USE_OLD_CACHE = int(os.environ.get('TIMM_USE_OLD_CACHE', 0)) > 0 - -# __all__ = ['set_pretrained_download_progress', 'set_pretrained_check_hash', 'load_custom_pretrained', 'load_pretrained', -# 'pretrained_cfg_for_features', 'resolve_pretrained_cfg', 'build_model_with_cfg'] - - -# def _resolve_pretrained_source(pretrained_cfg): -# cfg_source = pretrained_cfg.get('source', '') -# pretrained_url = pretrained_cfg.get('url', None) -# pretrained_file = pretrained_cfg.get('file', None) -# pretrained_sd = pretrained_cfg.get('state_dict', None) -# hf_hub_id = pretrained_cfg.get('hf_hub_id', None) - -# # resolve where to load pretrained weights from -# load_from = '' -# pretrained_loc = '' -# if cfg_source == 'hf-hub' and has_hf_hub(necessary=True): -# # hf-hub specified as source via model identifier -# load_from = 'hf-hub' -# assert hf_hub_id -# pretrained_loc = hf_hub_id -# else: -# # default source == timm or unspecified -# if pretrained_sd: -# # direct state_dict pass through is the highest priority -# load_from = 'state_dict' -# pretrained_loc = pretrained_sd -# assert isinstance(pretrained_loc, dict) -# elif pretrained_file: -# # file load override is the second-highest priority if set -# load_from = 'file' -# pretrained_loc = pretrained_file -# else: -# old_cache_valid = False -# if _USE_OLD_CACHE: -# # prioritized old cached weights if exists and env var enabled -# old_cache_valid = check_cached_file(pretrained_url) if pretrained_url else False -# if not old_cache_valid and hf_hub_id and has_hf_hub(necessary=True): -# # hf-hub available as alternate weight source in default_cfg -# load_from = 'hf-hub' -# pretrained_loc = hf_hub_id -# elif pretrained_url: -# load_from = 'url' -# pretrained_loc = pretrained_url - -# if load_from == 'hf-hub' and 
pretrained_cfg.get('hf_hub_filename', None): -# # if a filename override is set, return tuple for location w/ (hub_id, filename) -# pretrained_loc = pretrained_loc, pretrained_cfg['hf_hub_filename'] -# return load_from, pretrained_loc - - -# def set_pretrained_download_progress(enable=True): -# """ Set download progress for pretrained weights on/off (globally). """ -# global _DOWNLOAD_PROGRESS -# _DOWNLOAD_PROGRESS = enable - - -# def set_pretrained_check_hash(enable=True): -# """ Set hash checking for pretrained weights on/off (globally). """ -# global _CHECK_HASH -# _CHECK_HASH = enable - - -# def load_custom_pretrained( -# model: nn.Module, -# pretrained_cfg: Optional[Dict] = None, -# load_fn: Optional[Callable] = None, -# ): -# r"""Loads a custom (read non .pth) weight file - -# Downloads checkpoint file into cache-dir like torch.hub based loaders, but calls -# a passed in custom load fun, or the `load_pretrained` model member fn. - -# If the object is already present in `model_dir`, it's deserialized and returned. -# The default value of `model_dir` is ``/checkpoints`` where -# `hub_dir` is the directory returned by :func:`~torch.hub.get_dir`. - -# Args: -# model: The instantiated model to load weights into -# pretrained_cfg (dict): Default pretrained model cfg -# load_fn: An external standalone fn that loads weights into provided model, otherwise a fn named -# 'laod_pretrained' on the model will be called if it exists -# """ -# pretrained_cfg = pretrained_cfg or getattr(model, 'pretrained_cfg', None) -# if not pretrained_cfg: -# _logger.warning("Invalid pretrained config, cannot load weights.") -# return - -# load_from, pretrained_loc = _resolve_pretrained_source(pretrained_cfg) -# if not load_from: -# _logger.warning("No pretrained weights exist for this model. Using random initialization.") -# return -# if load_from == 'hf-hub': -# _logger.warning("Hugging Face hub not currently supported for custom load pretrained models.") -# elif load_from == 'url': -# pretrained_loc = download_cached_file( -# pretrained_loc, -# check_hash=_CHECK_HASH, -# progress=_DOWNLOAD_PROGRESS, -# ) - -# if load_fn is not None: -# load_fn(model, pretrained_loc) -# elif hasattr(model, 'load_pretrained'): -# model.load_pretrained(pretrained_loc) -# else: -# _logger.warning("Valid function to load pretrained weights is not available, using random initialization.") - - -# def load_pretrained( -# model: nn.Module, -# pretrained_cfg: Optional[Dict] = None, -# num_classes: int = 1000, -# in_chans: int = 3, -# filter_fn: Optional[Callable] = None, -# strict: bool = True, -# ): -# """ Load pretrained checkpoint - -# Args: -# model (nn.Module) : PyTorch model module -# pretrained_cfg (Optional[Dict]): configuration for pretrained weights / target dataset -# num_classes (int): num_classes for target model -# in_chans (int): in_chans for target model -# filter_fn (Optional[Callable]): state_dict filter fn for load (takes state_dict, model as args) -# strict (bool): strict load of checkpoint - -# """ -# pretrained_cfg = pretrained_cfg or getattr(model, 'pretrained_cfg', None) -# if not pretrained_cfg: -# raise RuntimeError("Invalid pretrained config, cannot load weights. 
Use `pretrained=False` for random init.") - -# load_from, pretrained_loc = _resolve_pretrained_source(pretrained_cfg) -# if load_from == 'state_dict': -# _logger.info('Loading pretrained weights from state dict') -# state_dict = pretrained_loc # pretrained_loc is the actual state dict for this override -# elif load_from == 'file': -# _logger.info(f'Loading pretrained weights from file ({pretrained_loc})') -# if pretrained_cfg.get('custom_load', False): -# model.load_pretrained(pretrained_loc) -# return -# else: -# state_dict = load_state_dict(pretrained_loc) -# elif load_from == 'url': -# _logger.info(f'Loading pretrained weights from url ({pretrained_loc})') -# if pretrained_cfg.get('custom_load', False): -# pretrained_loc = download_cached_file( -# pretrained_loc, -# progress=_DOWNLOAD_PROGRESS, -# check_hash=_CHECK_HASH, -# ) -# model.load_pretrained(pretrained_loc) -# return -# else: -# try: -# state_dict = load_state_dict_from_url( -# pretrained_loc, -# progress=_DOWNLOAD_PROGRESS, -# check_hash=_CHECK_HASH, -# weights_only=True, -# ) -# except TypeError: -# state_dict = load_state_dict_from_url( -# pretrained_loc, -# progress=_DOWNLOAD_PROGRESS, -# check_hash=_CHECK_HASH, -# ) -# elif load_from == 'hf-hub': -# _logger.info(f'Loading pretrained weights from Hugging Face hub ({pretrained_loc})') -# if isinstance(pretrained_loc, (list, tuple)): -# custom_load = pretrained_cfg.get('custom_load', False) -# if isinstance(custom_load, str) and custom_load == 'hf': -# load_custom_from_hf(*pretrained_loc, model) -# return -# else: -# state_dict = load_state_dict_from_hf(*pretrained_loc) -# else: -# state_dict = load_state_dict_from_hf(pretrained_loc, weights_only=True) -# else: -# model_name = pretrained_cfg.get('architecture', 'this model') -# raise RuntimeError(f"No pretrained weights exist for {model_name}. 
Use `pretrained=False` for random init.") - -# if filter_fn is not None: -# try: -# state_dict = filter_fn(state_dict, model) -# except TypeError as e: -# # for backwards compat with filter fn that take one arg -# state_dict = filter_fn(state_dict) - -# input_convs = pretrained_cfg.get('first_conv', None) -# if input_convs is not None and in_chans != 3: -# if isinstance(input_convs, str): -# input_convs = (input_convs,) -# for input_conv_name in input_convs: -# weight_name = input_conv_name + '.weight' -# try: -# state_dict[weight_name] = adapt_input_conv(in_chans, state_dict[weight_name]) -# _logger.info( -# f'Converted input conv {input_conv_name} pretrained weights from 3 to {in_chans} channel(s)') -# except NotImplementedError as e: -# del state_dict[weight_name] -# strict = False -# _logger.warning( -# f'Unable to convert pretrained {input_conv_name} weights, using random init for this layer.') - -# classifiers = pretrained_cfg.get('classifier', None) -# label_offset = pretrained_cfg.get('label_offset', 0) -# if classifiers is not None: -# if isinstance(classifiers, str): -# classifiers = (classifiers,) -# if num_classes != pretrained_cfg['num_classes']: -# for classifier_name in classifiers: -# # completely discard fully connected if model num_classes doesn't match pretrained weights -# state_dict.pop(classifier_name + '.weight', None) -# state_dict.pop(classifier_name + '.bias', None) -# strict = False -# elif label_offset > 0: -# for classifier_name in classifiers: -# # special case for pretrained weights with an extra background class in pretrained weights -# classifier_weight = state_dict[classifier_name + '.weight'] -# state_dict[classifier_name + '.weight'] = classifier_weight[label_offset:] -# classifier_bias = state_dict[classifier_name + '.bias'] -# state_dict[classifier_name + '.bias'] = classifier_bias[label_offset:] - -# load_result = model.load_state_dict(state_dict, strict=strict) -# if load_result.missing_keys: -# _logger.info( -# f'Missing keys ({", ".join(load_result.missing_keys)}) discovered while loading pretrained weights.' -# f' This is expected if model is being adapted.') -# if load_result.unexpected_keys: -# _logger.warning( -# f'Unexpected keys ({", ".join(load_result.unexpected_keys)}) found while loading pretrained weights.' -# f' This may be expected if model is being adapted.') - - -# def pretrained_cfg_for_features(pretrained_cfg): -# pretrained_cfg = deepcopy(pretrained_cfg) -# # remove default pretrained cfg fields that don't have much relevance for feature backbone -# to_remove = ('num_classes', 'classifier', 'global_pool') # add default final pool size? 
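# --- Illustrative sketch (not part of the commented-out builder code above) ---
# Simplified view of the source priority in _resolve_pretrained_source: an in-memory
# state_dict wins, then an explicit file path, then a Hugging Face hub id, then a
# plain URL. The cfg keys mirror the ones read by the original helper; the hf-hub
# availability and old-cache checks are omitted here.
def resolve_source(cfg):
    if cfg.get("state_dict"):
        return "state_dict", cfg["state_dict"]
    if cfg.get("file"):
        return "file", cfg["file"]
    if cfg.get("hf_hub_id"):
        return "hf-hub", cfg["hf_hub_id"]
    if cfg.get("url"):
        return "url", cfg["url"]
    return "", ""

print(resolve_source({"hf_hub_id": "some-org/some-model", "url": "https://example.com/w.pth"}))
# ('hf-hub', 'some-org/some-model')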
-# for tr in to_remove: -# pretrained_cfg.pop(tr, None) -# return pretrained_cfg - - -# def _filter_kwargs(kwargs, names): -# if not kwargs or not names: -# return -# for n in names: -# kwargs.pop(n, None) - - -# def _update_default_model_kwargs(pretrained_cfg, kwargs, kwargs_filter): -# """ Update the default_cfg and kwargs before passing to model - -# Args: -# pretrained_cfg: input pretrained cfg (updated in-place) -# kwargs: keyword args passed to model build fn (updated in-place) -# kwargs_filter: keyword arg keys that must be removed before model __init__ -# """ -# # Set model __init__ args that can be determined by default_cfg (if not already passed as kwargs) -# default_kwarg_names = ('num_classes', 'global_pool', 'in_chans') -# if pretrained_cfg.get('fixed_input_size', False): -# # if fixed_input_size exists and is True, model takes an img_size arg that fixes its input size -# default_kwarg_names += ('img_size',) - -# for n in default_kwarg_names: -# # for legacy reasons, model __init__args uses img_size + in_chans as separate args while -# # pretrained_cfg has one input_size=(C, H ,W) entry -# if n == 'img_size': -# input_size = pretrained_cfg.get('input_size', None) -# if input_size is not None: -# assert len(input_size) == 3 -# kwargs.setdefault(n, input_size[-2:]) -# elif n == 'in_chans': -# input_size = pretrained_cfg.get('input_size', None) -# if input_size is not None: -# assert len(input_size) == 3 -# kwargs.setdefault(n, input_size[0]) -# elif n == 'num_classes': -# default_val = pretrained_cfg.get(n, None) -# # if default is < 0, don't pass through to model -# if default_val is not None and default_val >= 0: -# kwargs.setdefault(n, pretrained_cfg[n]) -# else: -# default_val = pretrained_cfg.get(n, None) -# if default_val is not None: -# kwargs.setdefault(n, pretrained_cfg[n]) - -# # Filter keyword args for task specific model variants (some 'features only' models, etc.) -# _filter_kwargs(kwargs, names=kwargs_filter) - - -# def resolve_pretrained_cfg( -# variant: str, -# pretrained_cfg=None, -# pretrained_cfg_overlay=None, -# ) -> PretrainedCfg: -# model_with_tag = variant -# pretrained_tag = None -# if pretrained_cfg: -# if isinstance(pretrained_cfg, dict): -# # pretrained_cfg dict passed as arg, validate by converting to PretrainedCfg -# pretrained_cfg = PretrainedCfg(**pretrained_cfg) -# elif isinstance(pretrained_cfg, str): -# pretrained_tag = pretrained_cfg -# pretrained_cfg = None - -# # fallback to looking up pretrained cfg in model registry by variant identifier -# if not pretrained_cfg: -# if pretrained_tag: -# model_with_tag = '.'.join([variant, pretrained_tag]) -# pretrained_cfg = get_pretrained_cfg(model_with_tag) - -# if not pretrained_cfg: -# _logger.warning( -# f"No pretrained configuration specified for {model_with_tag} model. Using a default." 
-# f" Please add a config to the model pretrained_cfg registry or pass explicitly.") -# pretrained_cfg = PretrainedCfg() # instance with defaults - -# pretrained_cfg_overlay = pretrained_cfg_overlay or {} -# if not pretrained_cfg.architecture: -# pretrained_cfg_overlay.setdefault('architecture', variant) -# pretrained_cfg = dataclasses.replace(pretrained_cfg, **pretrained_cfg_overlay) - -# return pretrained_cfg - - -# def build_model_with_cfg( -# model_cls: Callable, -# variant: str, -# pretrained: bool, -# pretrained_cfg: Optional[Dict] = None, -# pretrained_cfg_overlay: Optional[Dict] = None, -# model_cfg: Optional[Any] = None, -# feature_cfg: Optional[Dict] = None, -# pretrained_strict: bool = True, -# pretrained_filter_fn: Optional[Callable] = None, -# kwargs_filter: Optional[Tuple[str]] = None, -# **kwargs, -# ): -# """ Build model with specified default_cfg and optional model_cfg - -# This helper fn aids in the construction of a model including: -# * handling default_cfg and associated pretrained weight loading -# * passing through optional model_cfg for models with config based arch spec -# * features_only model adaptation -# * pruning config / model adaptation - -# Args: -# model_cls: model class -# variant: model variant name -# pretrained: load pretrained weights -# pretrained_cfg: model's pretrained weight/task config -# model_cfg: model's architecture config -# feature_cfg: feature extraction adapter config -# pretrained_strict: load pretrained weights strictly -# pretrained_filter_fn: filter callable for pretrained weights -# kwargs_filter: kwargs to filter before passing to model -# **kwargs: model args passed through to model __init__ -# """ -# pruned = kwargs.pop('pruned', False) -# features = False -# feature_cfg = feature_cfg or {} - -# # resolve and update model pretrained config and model kwargs -# pretrained_cfg = resolve_pretrained_cfg( -# variant, -# pretrained_cfg=pretrained_cfg, -# pretrained_cfg_overlay=pretrained_cfg_overlay -# ) - -# # FIXME converting back to dict, PretrainedCfg use should be propagated further, but not into model -# pretrained_cfg = pretrained_cfg.to_dict() - -# _update_default_model_kwargs(pretrained_cfg, kwargs, kwargs_filter) - -# # Setup for feature extraction wrapper done at end of this fn -# if kwargs.pop('features_only', False): -# features = True -# feature_cfg.setdefault('out_indices', (0, 1, 2, 3, 4)) -# if 'out_indices' in kwargs: -# feature_cfg['out_indices'] = kwargs.pop('out_indices') -# if 'feature_cls' in kwargs: -# feature_cfg['feature_cls'] = kwargs.pop('feature_cls') - -# # Instantiate the model -# if model_cfg is None: -# model = model_cls(**kwargs) -# else: -# model = model_cls(cfg=model_cfg, **kwargs) -# model.pretrained_cfg = pretrained_cfg -# model.default_cfg = model.pretrained_cfg # alias for backwards compat - -# if pruned: -# model = adapt_model_from_file(model, variant) - -# # For classification models, check class attr, then kwargs, then default to 1k, otherwise 0 for feats -# num_classes_pretrained = 0 if features else getattr(model, 'num_classes', kwargs.get('num_classes', 1000)) -# if pretrained: -# load_pretrained( -# model, -# pretrained_cfg=pretrained_cfg, -# num_classes=num_classes_pretrained, -# in_chans=kwargs.get('in_chans', 3), -# filter_fn=pretrained_filter_fn, -# strict=pretrained_strict, -# ) - -# # Wrap the model in a feature extraction module if enabled -# if features: -# use_getter = False -# if 'feature_cls' in feature_cfg: -# feature_cls = feature_cfg.pop('feature_cls') -# if 
isinstance(feature_cls, str): -# feature_cls = feature_cls.lower() - -# # flatten_sequential only valid for some feature extractors -# if feature_cls not in ('dict', 'list', 'hook'): -# feature_cfg.pop('flatten_sequential', None) - -# if 'hook' in feature_cls: -# feature_cls = FeatureHookNet -# elif feature_cls == 'list': -# feature_cls = FeatureListNet -# elif feature_cls == 'dict': -# feature_cls = FeatureDictNet -# elif feature_cls == 'fx': -# feature_cls = FeatureGraphNet -# elif feature_cls == 'getter': -# use_getter = True -# feature_cls = FeatureGetterNet -# else: -# assert False, f'Unknown feature class {feature_cls}' -# else: -# feature_cls = FeatureListNet - -# output_fmt = getattr(model, 'output_fmt', None) -# if output_fmt is not None and not use_getter: # don't set default for intermediate feat getter -# feature_cfg.setdefault('output_fmt', output_fmt) - -# model = feature_cls(model, **feature_cfg) -# model.pretrained_cfg = pretrained_cfg_for_features(pretrained_cfg) # add back pretrained cfg -# model.default_cfg = model.pretrained_cfg # alias for rename backwards compat (default_cfg -> pretrained_cfg) - -# return model diff --git a/mindnlp/mimm/models/_efficientnet_blocks.py b/mindnlp/mimm/models/_efficientnet_blocks.py deleted file mode 100644 index e88a3dad5..000000000 --- a/mindnlp/mimm/models/_efficientnet_blocks.py +++ /dev/null @@ -1,701 +0,0 @@ -""" EfficientNet, MobileNetV3, etc Blocks - -Hacked together by / Copyright 2019, Ross Wightman -""" -from typing import Callable, Dict, Optional, Type - -from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F - -from mindnlp.mimm.layers import create_conv2d, DropPath, make_divisible, create_act_layer, create_aa, to_2tuple, LayerType,\ - ConvNormAct, get_norm_act_layer, MultiQueryAttention2d, Attention2d - -__all__ = [ - 'SqueezeExcite', 'ConvBnAct', 'DepthwiseSeparableConv', 'InvertedResidual', 'CondConvResidual', 'EdgeResidual', - 'UniversalInvertedResidual', 'MobileAttention' -] - -ModuleType = Type[nn.Module] - - -def num_groups(group_size: Optional[int], channels: int): - if not group_size: # 0 or None - return 1 # normal conv with 1 group - else: - # NOTE group_size == 1 -> depthwise conv - assert channels % group_size == 0 - return channels // group_size - - -class SqueezeExcite(nn.Module): - """ Squeeze-and-Excitation w/ specific features for EfficientNet/MobileNet family - - Args: - in_chs (int): input channels to layer - rd_ratio (float): ratio of squeeze reduction - act_layer (nn.Module): activation layer of containing block - gate_layer (Callable): attention gate function - force_act_layer (nn.Module): override block's activation fn if this is set/bound - rd_round_fn (Callable): specify a fn to calculate rounding of reduced chs - """ - - def __init__( - self, - in_chs: int, - rd_ratio: float = 0.25, - rd_channels: Optional[int] = None, - act_layer: LayerType = nn.ReLU, - gate_layer: LayerType = nn.Sigmoid, - force_act_layer: Optional[LayerType] = None, - rd_round_fn: Optional[Callable] = None, - ): - super(SqueezeExcite, self).__init__() - if rd_channels is None: - rd_round_fn = rd_round_fn or round - rd_channels = rd_round_fn(in_chs * rd_ratio) - act_layer = force_act_layer or act_layer - self.conv_reduce = nn.Conv2d(in_chs, rd_channels, 1, bias=True) - self.act1 = create_act_layer(act_layer, inplace=True) - self.conv_expand = nn.Conv2d(rd_channels, in_chs, 1, bias=True) - self.gate = create_act_layer(gate_layer) - - def forward(self, x): - x_se = x.mean((2, 3), keepdim=True) - x_se = 
self.conv_reduce(x_se) - x_se = self.act1(x_se) - x_se = self.conv_expand(x_se) - return x * self.gate(x_se) - - -class ConvBnAct(nn.Module): - """ Conv + Norm Layer + Activation w/ optional skip connection - """ - def __init__( - self, - in_chs: int, - out_chs: int, - kernel_size: int, - stride: int = 1, - dilation: int = 1, - group_size: int = 0, - pad_type: str = '', - skip: bool = False, - act_layer: LayerType = nn.ReLU, - norm_layer: LayerType = nn.BatchNorm2d, - aa_layer: Optional[LayerType] = None, - drop_path_rate: float = 0., - ): - super(ConvBnAct, self).__init__() - norm_act_layer = get_norm_act_layer(norm_layer, act_layer) - groups = num_groups(group_size, in_chs) - self.has_skip = skip and stride == 1 and in_chs == out_chs - use_aa = aa_layer is not None and stride > 1 # FIXME handle dilation - - self.conv = create_conv2d( - in_chs, out_chs, kernel_size, - stride=1 if use_aa else stride, - dilation=dilation, groups=groups, padding=pad_type) - self.bn1 = norm_act_layer(out_chs, inplace=True) - self.aa = create_aa(aa_layer, channels=out_chs, stride=stride, enable=use_aa) - self.drop_path = DropPath(drop_path_rate) if drop_path_rate else nn.Identity() - - def feature_info(self, location): - if location == 'expansion': # output of conv after act, same as block coutput - return dict(module='bn1', hook_type='forward', num_chs=self.conv.out_channels) - else: # location == 'bottleneck', block output - return dict(module='', num_chs=self.conv.out_channels) - - def forward(self, x): - shortcut = x - x = self.conv(x) - x = self.bn1(x) - x = self.aa(x) - if self.has_skip: - x = self.drop_path(x) + shortcut - return x - - -class DepthwiseSeparableConv(nn.Module): - """ Depthwise-separable block - Used for DS convs in MobileNet-V1 and in the place of IR blocks that have no expansion - (factor of 1.0). This is an alternative to having a IR with an optional first pw conv. 
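# --- Illustrative sketch (not part of the deleted efficientnet-blocks module) ---
# The group_size convention used by num_groups above: group_size counts channels per
# group, so 0/None means an ordinary dense conv (one group) and group_size == 1 means
# a depthwise conv (one group per channel).
def num_groups_demo(group_size, channels):
    if not group_size:
        return 1
    assert channels % group_size == 0
    return channels // group_size

print(num_groups_demo(0, 64), num_groups_demo(1, 64), num_groups_demo(8, 64))   # 1 64 8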
- """ - def __init__( - self, - in_chs: int, - out_chs: int, - dw_kernel_size: int = 3, - stride: int = 1, - dilation: int = 1, - group_size: int = 1, - pad_type: str = '', - noskip: bool = False, - pw_kernel_size: int = 1, - pw_act: bool = False, - s2d: int = 0, - act_layer: LayerType = nn.ReLU, - norm_layer: LayerType = nn.BatchNorm2d, - aa_layer: Optional[LayerType] = None, - se_layer: Optional[ModuleType] = None, - drop_path_rate: float = 0., - ): - super(DepthwiseSeparableConv, self).__init__() - norm_act_layer = get_norm_act_layer(norm_layer, act_layer) - self.has_skip = (stride == 1 and in_chs == out_chs) and not noskip - self.has_pw_act = pw_act # activation after point-wise conv - use_aa = aa_layer is not None and stride > 1 # FIXME handle dilation - - # Space to depth - if s2d == 1: - sd_chs = int(in_chs * 4) - self.conv_s2d = create_conv2d(in_chs, sd_chs, kernel_size=2, stride=2, padding='same') - self.bn_s2d = norm_act_layer(sd_chs, sd_chs) - dw_kernel_size = (dw_kernel_size + 1) // 2 - dw_pad_type = 'same' if dw_kernel_size == 2 else pad_type - in_chs = sd_chs - use_aa = False # disable AA - else: - self.conv_s2d = None - self.bn_s2d = None - dw_pad_type = pad_type - - groups = num_groups(group_size, in_chs) - - self.conv_dw = create_conv2d( - in_chs, in_chs, dw_kernel_size, - stride=1 if use_aa else stride, - dilation=dilation, padding=dw_pad_type, groups=groups) - self.bn1 = norm_act_layer(in_chs, inplace=True) - self.aa = create_aa(aa_layer, channels=out_chs, stride=stride, enable=use_aa) - - # Squeeze-and-excitation - self.se = se_layer(in_chs, act_layer=act_layer) if se_layer else nn.Identity() - - self.conv_pw = create_conv2d(in_chs, out_chs, pw_kernel_size, padding=pad_type) - self.bn2 = norm_act_layer(out_chs, inplace=True, apply_act=self.has_pw_act) - self.drop_path = DropPath(drop_path_rate) if drop_path_rate else nn.Identity() - - def feature_info(self, location): - if location == 'expansion': # after SE, input to PW - return dict(module='conv_pw', hook_type='forward_pre', num_chs=self.conv_pw.in_channels) - else: # location == 'bottleneck', block output - return dict(module='', num_chs=self.conv_pw.out_channels) - - def forward(self, x): - shortcut = x - if self.conv_s2d is not None: - x = self.conv_s2d(x) - x = self.bn_s2d(x) - x = self.conv_dw(x) - x = self.bn1(x) - x = self.aa(x) - x = self.se(x) - x = self.conv_pw(x) - x = self.bn2(x) - if self.has_skip: - x = self.drop_path(x) + shortcut - return x - - -class InvertedResidual(nn.Module): - """ Inverted residual block w/ optional SE - - Originally used in MobileNet-V2 - https://arxiv.org/abs/1801.04381v4, this layer is often - referred to as 'MBConv' for (Mobile inverted bottleneck conv) and is also used in - * MNasNet - https://arxiv.org/abs/1807.11626 - * EfficientNet - https://arxiv.org/abs/1905.11946 - * MobileNet-V3 - https://arxiv.org/abs/1905.02244 - """ - - def __init__( - self, - in_chs: int, - out_chs: int, - dw_kernel_size: int = 3, - stride: int = 1, - dilation: int = 1, - group_size: int = 1, - pad_type: str = '', - noskip: bool = False, - exp_ratio: float = 1.0, - exp_kernel_size: int = 1, - pw_kernel_size: int = 1, - s2d: int = 0, - act_layer: LayerType = nn.ReLU, - norm_layer: LayerType = nn.BatchNorm2d, - aa_layer: Optional[LayerType] = None, - se_layer: Optional[ModuleType] = None, - conv_kwargs: Optional[Dict] = None, - drop_path_rate: float = 0., - ): - super(InvertedResidual, self).__init__() - norm_act_layer = get_norm_act_layer(norm_layer, act_layer) - conv_kwargs = conv_kwargs or {} - 
self.has_skip = (in_chs == out_chs and stride == 1) and not noskip - use_aa = aa_layer is not None and stride > 1 # FIXME handle dilation - - # Space to depth - if s2d == 1: - sd_chs = int(in_chs * 4) - self.conv_s2d = create_conv2d(in_chs, sd_chs, kernel_size=2, stride=2, padding='same') - self.bn_s2d = norm_act_layer(sd_chs, sd_chs) - dw_kernel_size = (dw_kernel_size + 1) // 2 - dw_pad_type = 'same' if dw_kernel_size == 2 else pad_type - in_chs = sd_chs - use_aa = False # disable AA - else: - self.conv_s2d = None - self.bn_s2d = None - dw_pad_type = pad_type - - mid_chs = make_divisible(in_chs * exp_ratio) - groups = num_groups(group_size, mid_chs) - - # Point-wise expansion - self.conv_pw = create_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type, **conv_kwargs) - self.bn1 = norm_act_layer(mid_chs, inplace=True) - - # Depth-wise convolution - self.conv_dw = create_conv2d( - mid_chs, mid_chs, dw_kernel_size, - stride=1 if use_aa else stride, - dilation=dilation, groups=groups, padding=dw_pad_type, **conv_kwargs) - self.bn2 = norm_act_layer(mid_chs, inplace=True) - self.aa = create_aa(aa_layer, channels=mid_chs, stride=stride, enable=use_aa) - - # Squeeze-and-excitation - self.se = se_layer(mid_chs, act_layer=act_layer) if se_layer else nn.Identity() - - # Point-wise linear projection - self.conv_pwl = create_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type, **conv_kwargs) - self.bn3 = norm_act_layer(out_chs, apply_act=False) - self.drop_path = DropPath(drop_path_rate) if drop_path_rate else nn.Identity() - - def feature_info(self, location): - if location == 'expansion': # after SE, input to PWL - return dict(module='conv_pwl', hook_type='forward_pre', num_chs=self.conv_pwl.in_channels) - else: # location == 'bottleneck', block output - return dict(module='', num_chs=self.conv_pwl.out_channels) - - def forward(self, x): - shortcut = x - if self.conv_s2d is not None: - x = self.conv_s2d(x) - x = self.bn_s2d(x) - x = self.conv_pw(x) - x = self.bn1(x) - x = self.conv_dw(x) - x = self.bn2(x) - x = self.aa(x) - x = self.se(x) - x = self.conv_pwl(x) - x = self.bn3(x) - if self.has_skip: - x = self.drop_path(x) + shortcut - return x - - -class LayerScale2d(nn.Module): - def __init__(self, dim: int, init_values: float = 1e-5, inplace: bool = False): - super().__init__() - self.inplace = inplace - self.gamma = nn.Parameter(init_values * ops.ones(dim)) - - def forward(self, x): - gamma = self.gamma.view(1, -1, 1, 1) - return x.mul_(gamma) if self.inplace else x * gamma - - -class UniversalInvertedResidual(nn.Module): - """ Universal Inverted Residual Block (aka Universal Inverted Bottleneck, UIB) - - For MobileNetV4 - https://arxiv.org/abs/, referenced from - https://github.com/tensorflow/models/blob/d93c7e932de27522b2fa3b115f58d06d6f640537/official/vision/modeling/layers/nn_blocks.py#L778 - """ - - def __init__( - self, - in_chs: int, - out_chs: int, - dw_kernel_size_start: int = 0, - dw_kernel_size_mid: int = 3, - dw_kernel_size_end: int = 0, - stride: int = 1, - dilation: int = 1, - group_size: int = 1, - pad_type: str = '', - noskip: bool = False, - exp_ratio: float = 1.0, - act_layer: LayerType = nn.ReLU, - norm_layer: LayerType = nn.BatchNorm2d, - aa_layer: Optional[LayerType] = None, - se_layer: Optional[ModuleType] = None, - conv_kwargs: Optional[Dict] = None, - drop_path_rate: float = 0., - layer_scale_init_value: Optional[float] = 1e-5, - ): - super(UniversalInvertedResidual, self).__init__() - conv_kwargs = conv_kwargs or {} - self.has_skip = (in_chs == out_chs and 
stride == 1) and not noskip - if stride > 1: - assert dw_kernel_size_start or dw_kernel_size_mid or dw_kernel_size_end - - # FIXME dilation isn't right w/ extra ks > 1 convs - if dw_kernel_size_start: - dw_start_stride = stride if not dw_kernel_size_mid else 1 - dw_start_groups = num_groups(group_size, in_chs) - self.dw_start = ConvNormAct( - in_chs, in_chs, dw_kernel_size_start, - stride=dw_start_stride, - dilation=dilation, # FIXME - groups=dw_start_groups, - padding=pad_type, - apply_act=False, - act_layer=act_layer, - norm_layer=norm_layer, - aa_layer=aa_layer, - **conv_kwargs, - ) - else: - self.dw_start = nn.Identity() - - # Point-wise expansion - mid_chs = make_divisible(in_chs * exp_ratio) - self.pw_exp = ConvNormAct( - in_chs, mid_chs, 1, - padding=pad_type, - act_layer=act_layer, - norm_layer=norm_layer, - **conv_kwargs, - ) - - # Middle depth-wise convolution - if dw_kernel_size_mid: - groups = num_groups(group_size, mid_chs) - self.dw_mid = ConvNormAct( - mid_chs, mid_chs, dw_kernel_size_mid, - stride=stride, - dilation=dilation, # FIXME - groups=groups, - padding=pad_type, - act_layer=act_layer, - norm_layer=norm_layer, - aa_layer=aa_layer, - **conv_kwargs, - ) - else: - # keeping mid as identity so it can be hooked more easily for features - self.dw_mid = nn.Identity() - - # Squeeze-and-excitation - self.se = se_layer(mid_chs, act_layer=act_layer) if se_layer else nn.Identity() - - # Point-wise linear projection - self.pw_proj = ConvNormAct( - mid_chs, out_chs, 1, - padding=pad_type, - apply_act=False, - act_layer=act_layer, - norm_layer=norm_layer, - **conv_kwargs, - ) - - if dw_kernel_size_end: - dw_end_stride = stride if not dw_kernel_size_start and not dw_kernel_size_mid else 1 - dw_end_groups = num_groups(group_size, out_chs) - if dw_end_stride > 1: - assert not aa_layer - self.dw_end = ConvNormAct( - out_chs, out_chs, dw_kernel_size_end, - stride=dw_end_stride, - dilation=dilation, - groups=dw_end_groups, - padding=pad_type, - apply_act=False, - act_layer=act_layer, - norm_layer=norm_layer, - **conv_kwargs, - ) - else: - self.dw_end = nn.Identity() - - if layer_scale_init_value is not None: - self.layer_scale = LayerScale2d(out_chs, layer_scale_init_value) - else: - self.layer_scale = nn.Identity() - self.drop_path = DropPath(drop_path_rate) if drop_path_rate else nn.Identity() - - def feature_info(self, location): - if location == 'expansion': # after SE, input to PWL - return dict(module='pw_proj.conv', hook_type='forward_pre', num_chs=self.pw_proj.conv.in_channels) - else: # location == 'bottleneck', block output - return dict(module='', num_chs=self.pw_proj.conv.out_channels) - - def forward(self, x): - shortcut = x - x = self.dw_start(x) - x = self.pw_exp(x) - x = self.dw_mid(x) - x = self.se(x) - x = self.pw_proj(x) - x = self.dw_end(x) - x = self.layer_scale(x) - if self.has_skip: - x = self.drop_path(x) + shortcut - return x - - -class MobileAttention(nn.Module): - """ Mobile Attention Block - - For MobileNetV4 - https://arxiv.org/abs/, referenced from - https://github.com/tensorflow/models/blob/d93c7e932de27522b2fa3b115f58d06d6f640537/official/vision/modeling/layers/nn_blocks.py#L1504 - """ - def __init__( - self, - in_chs: int, - out_chs: int, - stride: int = 1, - dw_kernel_size: int = 3, - dilation: int = 1, - group_size: int = 1, - pad_type: str = '', - num_heads: int = 8, - key_dim: int = 64, - value_dim: int = 64, - use_multi_query: bool = False, - query_strides: int = (1, 1), - kv_stride: int = 1, - cpe_dw_kernel_size: int = 3, - noskip: bool = False, - 
act_layer: LayerType = nn.ReLU, - norm_layer: LayerType = nn.BatchNorm2d, - aa_layer: Optional[LayerType] = None, - drop_path_rate: float = 0., - attn_drop: float = 0.0, - proj_drop: float = 0.0, - layer_scale_init_value: Optional[float] = 1e-5, - use_bias: bool = False, - use_cpe: bool = False, - ): - super(MobileAttention, self).__init__() - norm_act_layer = get_norm_act_layer(norm_layer, act_layer) - self.has_skip = (stride == 1 and in_chs == out_chs) and not noskip - self.query_strides = to_2tuple(query_strides) - self.kv_stride = kv_stride - self.has_query_stride = any([s > 1 for s in self.query_strides]) - - # This CPE is different than the one suggested in the original paper. - # https://arxiv.org/abs/2102.10882 - # 1. Rather than adding one CPE before the attention blocks, we add a CPE - # into every attention block. - # 2. We replace the expensive Conv2D by a Seperable DW Conv. - if use_cpe: - self.conv_cpe_dw = create_conv2d( - in_chs, in_chs, - kernel_size=cpe_dw_kernel_size, - dilation=dilation, - depthwise=True, - bias=True, - ) - else: - self.conv_cpe_dw = None - - self.norm = norm_act_layer(in_chs, apply_act=False) - - if num_heads is None: - assert in_chs % key_dim == 0 - num_heads = in_chs // key_dim - - if use_multi_query: - self.attn = MultiQueryAttention2d( - in_chs, - dim_out=out_chs, - num_heads=num_heads, - key_dim=key_dim, - value_dim=value_dim, - query_strides=query_strides, - kv_stride=kv_stride, - dilation=dilation, - padding=pad_type, - dw_kernel_size=dw_kernel_size, - attn_drop=attn_drop, - proj_drop=proj_drop, - #bias=use_bias, # why not here if used w/ mhsa? - ) - else: - self.attn = Attention2d( - in_chs, - dim_out=out_chs, - num_heads=num_heads, - attn_drop=attn_drop, - proj_drop=proj_drop, - bias=use_bias, - ) - - if layer_scale_init_value is not None: - self.layer_scale = LayerScale2d(out_chs, layer_scale_init_value) - else: - self.layer_scale = nn.Identity() - - self.drop_path = DropPath(drop_path_rate) if drop_path_rate else nn.Identity() - - def feature_info(self, location): - if location == 'expansion': # after SE, input to PW - return dict(module='conv_pw', hook_type='forward_pre', num_chs=self.conv_pw.in_channels) - else: # location == 'bottleneck', block output - return dict(module='', num_chs=self.conv_pw.out_channels) - - def forward(self, x): - if self.conv_cpe_dw is not None: - x_cpe = self.conv_cpe_dw(x) - x = x + x_cpe - - shortcut = x - x = self.norm(x) - x = self.attn(x) - x = self.layer_scale(x) - if self.has_skip: - x = self.drop_path(x) + shortcut - - return x - - -class CondConvResidual(InvertedResidual): - """ Inverted residual block w/ CondConv routing""" - - def __init__( - self, - in_chs: int, - out_chs: int, - dw_kernel_size: int = 3, - stride: int = 1, - dilation: int = 1, - group_size: int = 1, - pad_type: str = '', - noskip: bool = False, - exp_ratio: float = 1.0, - exp_kernel_size: int = 1, - pw_kernel_size: int = 1, - act_layer: LayerType = nn.ReLU, - norm_layer: LayerType = nn.BatchNorm2d, - aa_layer: Optional[LayerType] = None, - se_layer: Optional[ModuleType] = None, - num_experts: int = 0, - drop_path_rate: float = 0., - ): - - self.num_experts = num_experts - conv_kwargs = dict(num_experts=self.num_experts) - super(CondConvResidual, self).__init__( - in_chs, - out_chs, - dw_kernel_size=dw_kernel_size, - stride=stride, - dilation=dilation, - group_size=group_size, - pad_type=pad_type, - noskip=noskip, - exp_ratio=exp_ratio, - exp_kernel_size=exp_kernel_size, - pw_kernel_size=pw_kernel_size, - act_layer=act_layer, - 
norm_layer=norm_layer, - aa_layer=aa_layer, - se_layer=se_layer, - conv_kwargs=conv_kwargs, - drop_path_rate=drop_path_rate, - ) - self.routing_fn = nn.Linear(in_chs, self.num_experts) - - def forward(self, x): - shortcut = x - pooled_inputs = F.adaptive_avg_pool2d(x, 1).flatten(1) # CondConv routing - routing_weights = ops.sigmoid(self.routing_fn(pooled_inputs)) - x = self.conv_pw(x, routing_weights) - x = self.bn1(x) - x = self.conv_dw(x, routing_weights) - x = self.bn2(x) - x = self.se(x) - x = self.conv_pwl(x, routing_weights) - x = self.bn3(x) - if self.has_skip: - x = self.drop_path(x) + shortcut - return x - - -class EdgeResidual(nn.Module): - """ Residual block with expansion convolution followed by pointwise-linear w/ stride - - Originally introduced in `EfficientNet-EdgeTPU: Creating Accelerator-Optimized Neural Networks with AutoML` - - https://ai.googleblog.com/2019/08/efficientnet-edgetpu-creating.html - - This layer is also called FusedMBConv in the MobileDet, EfficientNet-X, and EfficientNet-V2 papers - * MobileDet - https://arxiv.org/abs/2004.14525 - * EfficientNet-X - https://arxiv.org/abs/2102.05610 - * EfficientNet-V2 - https://arxiv.org/abs/2104.00298 - """ - - def __init__( - self, - in_chs: int, - out_chs: int, - exp_kernel_size: int = 3, - stride: int = 1, - dilation: int = 1, - group_size: int = 0, - pad_type: str = '', - force_in_chs: int = 0, - noskip: bool = False, - exp_ratio: float = 1.0, - pw_kernel_size: int = 1, - act_layer: LayerType = nn.ReLU, - norm_layer: LayerType = nn.BatchNorm2d, - aa_layer: Optional[LayerType] = None, - se_layer: Optional[ModuleType] = None, - drop_path_rate: float = 0., - ): - super(EdgeResidual, self).__init__() - norm_act_layer = get_norm_act_layer(norm_layer, act_layer) - if force_in_chs > 0: - mid_chs = make_divisible(force_in_chs * exp_ratio) - else: - mid_chs = make_divisible(in_chs * exp_ratio) - groups = num_groups(group_size, mid_chs) # NOTE: Using out_chs of conv_exp for groups calc - self.has_skip = (in_chs == out_chs and stride == 1) and not noskip - use_aa = aa_layer is not None and stride > 1 # FIXME handle dilation - - # Expansion convolution - self.conv_exp = create_conv2d( - in_chs, mid_chs, exp_kernel_size, - stride=1 if use_aa else stride, - dilation=dilation, groups=groups, padding=pad_type) - self.bn1 = norm_act_layer(mid_chs, inplace=True) - - self.aa = create_aa(aa_layer, channels=mid_chs, stride=stride, enable=use_aa) - - # Squeeze-and-excitation - self.se = se_layer(mid_chs, act_layer=act_layer) if se_layer else nn.Identity() - - # Point-wise linear projection - self.conv_pwl = create_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type) - self.bn2 = norm_act_layer(out_chs, apply_act=False) - self.drop_path = DropPath(drop_path_rate) if drop_path_rate else nn.Identity() - - def feature_info(self, location): - if location == 'expansion': # after SE, before PWL - return dict(module='conv_pwl', hook_type='forward_pre', num_chs=self.conv_pwl.in_channels) - else: # location == 'bottleneck', block output - return dict(module='', num_chs=self.conv_pwl.out_channels) - - def forward(self, x): - shortcut = x - x = self.conv_exp(x) - x = self.bn1(x) - x = self.aa(x) - x = self.se(x) - x = self.conv_pwl(x) - x = self.bn2(x) - if self.has_skip: - x = self.drop_path(x) + shortcut - return x diff --git a/mindnlp/mimm/models/beit.py b/mindnlp/mimm/models/beit.py deleted file mode 100644 index b306039d6..000000000 --- a/mindnlp/mimm/models/beit.py +++ /dev/null @@ -1,692 +0,0 @@ -# """ BEiT: BERT Pre-Training of Image 
Transformers (https://arxiv.org/abs/2106.08254) - -# Model from official source: https://github.com/microsoft/unilm/tree/master/beit - -# @inproceedings{beit, -# title={{BEiT}: {BERT} Pre-Training of Image Transformers}, -# author={Hangbo Bao and Li Dong and Songhao Piao and Furu Wei}, -# booktitle={International Conference on Learning Representations}, -# year={2022}, -# url={https://openreview.net/forum?id=p-BhZSz59o4} -# } - -# BEiT-v2 from https://github.com/microsoft/unilm/tree/master/beit2 - -# @article{beitv2, -# title={{BEiT v2}: Masked Image Modeling with Vector-Quantized Visual Tokenizers}, -# author={Zhiliang Peng and Li Dong and Hangbo Bao and Qixiang Ye and Furu Wei}, -# year={2022}, -# eprint={2208.06366}, -# archivePrefix={arXiv}, -# primaryClass={cs.CV} -# } - -# At this point only the 1k fine-tuned classification weights and model configs have been added, -# see original source above for pre-training models and procedure. - -# Modifications by / Copyright 2021 Ross Wightman, original copyrights below -# """ -# # -------------------------------------------------------- -# # BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254) -# # Github source: https://github.com/microsoft/unilm/tree/master/beit -# # Copyright (c) 2021 Microsoft -# # Licensed under The MIT License [see LICENSE for details] -# # By Hangbo Bao -# # Based on timm and DeiT code bases -# # https://github.com/rwightman/pytorch-image-models/tree/master/timm -# # https://github.com/facebookresearch/deit/ -# # https://github.com/facebookresearch/dino -# # --------------------------------------------------------' -# # pylint: disable=use-dict-literal -# import math -# from typing import Callable, List, Optional, Tuple, Union - -# import mindspore -# from mindnlp.core import nn, ops -# from mindnlp.core.nn import functional as F - -# from mindnlp.configs import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -# from mindnlp.mimm.layers import PatchEmbed, Mlp, SwiGLU, LayerNorm, DropPath, trunc_normal_ -# from mindnlp.mimm.layers import resample_patch_embed, resample_abs_pos_embed, resize_rel_pos_bias_table, ndgrid - - -# from ._builder import build_model_with_cfg -# from ._features import feature_take_indices -# from ._registry import generate_default_cfgs, register_model - -# __all__ = ['Beit'] - - -# def gen_relative_position_index(window_size: Tuple[int, int]) -> mindspore.Tensor: -# num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 -# # cls to token & token 2 cls & cls to cls -# # get pair-wise relative position index for each token inside the window -# window_area = window_size[0] * window_size[1] -# coords = ops.stack(ndgrid(ops.arange(window_size[0]), ops.arange(window_size[1]))) # 2, Wh, Ww -# coords_flatten = ops.flatten(coords, 1) # 2, Wh*Ww -# relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww -# relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 -# relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 -# relative_coords[:, :, 1] += window_size[1] - 1 -# relative_coords[:, :, 0] *= 2 * window_size[1] - 1 -# relative_position_index = ops.zeros((window_area + 1,) * 2, dtype=relative_coords.dtype) -# relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww -# relative_position_index[0, 0:] = num_relative_distance - 3 -# relative_position_index[0:, 0] = num_relative_distance - 2 -# relative_position_index[0, 0] = num_relative_distance - 1 -# return 
relative_position_index - - -# class Attention(nn.Module): -# fused_attn: bool - -# def __init__( -# self, -# dim: int, -# num_heads: int = 8, -# qkv_bias: bool = False, -# qkv_bias_separate: bool = False, -# attn_drop: float = 0., -# proj_drop: float = 0., -# window_size: Optional[Tuple[int, int]] = None, -# attn_head_dim: Optional[int] = None, -# ): -# super().__init__() -# self.num_heads = num_heads -# head_dim = dim // num_heads -# if attn_head_dim is not None: -# head_dim = attn_head_dim -# all_head_dim = head_dim * self.num_heads -# self.scale = head_dim ** -0.5 -# self.qkv_bias_separate = qkv_bias_separate - -# self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) -# if qkv_bias: -# self.q_bias = nn.Parameter(ops.zeros(all_head_dim)) -# self.register_buffer('k_bias', ops.zeros(all_head_dim), persistent=False) -# self.v_bias = nn.Parameter(ops.zeros(all_head_dim)) -# else: -# self.q_bias = None -# self.k_bias = None -# self.v_bias = None - -# if window_size: -# self.window_size = window_size -# self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 -# self.relative_position_bias_table = nn.Parameter( -# ops.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH -# self.register_buffer("relative_position_index", gen_relative_position_index(window_size), persistent=False) -# else: -# self.window_size = None -# self.relative_position_bias_table = None -# self.relative_position_index = None - -# self.attn_drop = nn.Dropout(attn_drop) -# self.proj = nn.Linear(all_head_dim, dim) -# self.proj_drop = nn.Dropout(proj_drop) - -# def _get_rel_pos_bias(self): -# relative_position_bias = self.relative_position_bias_table[ -# self.relative_position_index.view(-1)].view( -# self.window_size[0] * self.window_size[1] + 1, -# self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH -# relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww -# return relative_position_bias.unsqueeze(0) - -# def forward(self, x, shared_rel_pos_bias: Optional[mindspore.Tensor] = None): -# B, N, C = x.shape - -# if self.q_bias is None: -# qkv = self.qkv(x) -# else: -# qkv_bias = ops.cat((self.q_bias, self.k_bias, self.v_bias)) -# if self.qkv_bias_separate: -# qkv = self.qkv(x) -# qkv += qkv_bias -# else: -# qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias) -# qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) -# q, k, v = qkv.unbind(0) # B, num_heads, N, head_dim - -# q = q * self.scale -# attn = (q @ k.transpose(-2, -1)) - -# if self.relative_position_bias_table is not None: -# attn = attn + self._get_rel_pos_bias() -# if shared_rel_pos_bias is not None: -# attn = attn + shared_rel_pos_bias - -# attn = attn.softmax(dim=-1) -# attn = self.attn_drop(attn) -# x = attn @ v - -# x = x.transpose(1, 2).reshape(B, N, C) -# x = self.proj(x) -# x = self.proj_drop(x) -# return x - - -# class Block(nn.Module): - -# def __init__( -# self, -# dim: int, -# num_heads: int, -# qkv_bias: bool = False, -# mlp_ratio: float = 4., -# scale_mlp: bool = False, -# swiglu_mlp: bool = False, -# proj_drop: float = 0., -# attn_drop: float = 0., -# drop_path: float = 0., -# init_values: Optional[float] = None, -# act_layer: Callable = nn.GELU, -# norm_layer: Callable = LayerNorm, -# window_size: Optional[Tuple[int, int]] = None, -# attn_head_dim: Optional[int] = None, -# ): -# super().__init__() -# self.norm1 = norm_layer(dim) -# self.attn = Attention( -# dim, -# num_heads=num_heads, -# qkv_bias=qkv_bias, -# attn_drop=attn_drop, 
-# proj_drop=proj_drop, -# window_size=window_size, -# attn_head_dim=attn_head_dim, -# ) -# # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here -# self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity() - -# self.norm2 = norm_layer(dim) -# if swiglu_mlp: -# self.mlp = SwiGLU( -# in_features=dim, -# hidden_features=int(dim * mlp_ratio), -# norm_layer=norm_layer if scale_mlp else None, -# drop=proj_drop, -# ) -# else: -# self.mlp = Mlp( -# in_features=dim, -# hidden_features=int(dim * mlp_ratio), -# act_layer=act_layer, -# norm_layer=norm_layer if scale_mlp else None, -# drop=proj_drop, -# ) -# self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity() - -# if init_values: -# self.gamma_1 = nn.Parameter(init_values * ops.ones(dim)) -# self.gamma_2 = nn.Parameter(init_values * ops.ones(dim)) -# else: -# self.gamma_1, self.gamma_2 = None, None - -# def forward(self, x, shared_rel_pos_bias: Optional[mindspore.Tensor] = None): -# if self.gamma_1 is None: -# x = x + self.drop_path1(self.attn(self.norm1(x), shared_rel_pos_bias=shared_rel_pos_bias)) -# x = x + self.drop_path2(self.mlp(self.norm2(x))) -# else: -# x = x + self.drop_path1(self.gamma_1 * self.attn(self.norm1(x), shared_rel_pos_bias=shared_rel_pos_bias)) -# x = x + self.drop_path2(self.gamma_2 * self.mlp(self.norm2(x))) -# return x - - -# class RelativePositionBias(nn.Module): - -# def __init__(self, window_size, num_heads): -# super().__init__() -# self.window_size = window_size -# self.window_area = window_size[0] * window_size[1] -# num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 -# self.relative_position_bias_table = nn.Parameter(ops.zeros(num_relative_distance, num_heads)) -# # trunc_normal_(self.relative_position_bias_table, std=.02) -# self.register_buffer("relative_position_index", gen_relative_position_index(window_size)) - -# def forward(self): -# relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( -# self.window_area + 1, self.window_area + 1, -1) # Wh*Ww,Wh*Ww,nH -# return relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww - - -# class Beit(nn.Module): -# """ Vision Transformer with support for patch or hybrid CNN input stage -# """ - -# def __init__( -# self, -# img_size: Union[int, Tuple[int, int]] = 224, -# patch_size: Union[int, Tuple[int, int]] = 16, -# in_chans: int = 3, -# num_classes: int = 1000, -# global_pool: str = 'avg', -# embed_dim: int = 768, -# depth: int = 12, -# num_heads: int = 12, -# qkv_bias: bool = True, -# mlp_ratio: float = 4., -# swiglu_mlp: bool = False, -# scale_mlp: bool = False, -# drop_rate: float = 0., -# pos_drop_rate: float = 0., -# proj_drop_rate: float = 0., -# attn_drop_rate: float = 0., -# drop_path_rate: float = 0., -# norm_layer: Callable = LayerNorm, -# init_values: Optional[float] = None, -# use_abs_pos_emb: bool = True, -# use_rel_pos_bias: bool = False, -# use_shared_rel_pos_bias: bool = False, -# head_init_scale: float = 0.001, -# ): -# super().__init__() -# self.num_classes = num_classes -# self.global_pool = global_pool -# self.num_features = self.head_hidden_size = self.embed_dim = embed_dim # for consistency with other models -# self.num_prefix_tokens = 1 -# self.grad_checkpointing = False - -# self.patch_embed = PatchEmbed( -# img_size=img_size, -# patch_size=patch_size, -# in_chans=in_chans, -# embed_dim=embed_dim, -# ) -# num_patches = self.patch_embed.num_patches -# r = self.patch_embed.feat_ratio() 
if hasattr(self.patch_embed, 'feat_ratio') else patch_size - -# self.cls_token = nn.Parameter(ops.zeros(1, 1, embed_dim)) -# # self.mask_token = nn.Parameter(ops.zeros(1, 1, embed_dim)) -# self.pos_embed = nn.Parameter(ops.zeros(1, num_patches + 1, embed_dim)) if use_abs_pos_emb else None -# self.pos_drop = nn.Dropout(p=pos_drop_rate) - -# if use_shared_rel_pos_bias: -# self.rel_pos_bias = RelativePositionBias( -# window_size=self.patch_embed.grid_size, -# num_heads=num_heads, -# ) -# else: -# self.rel_pos_bias = None - -# dpr = [x.item() for x in ops.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule -# self.blocks = nn.ModuleList([ -# Block( -# dim=embed_dim, -# num_heads=num_heads, -# qkv_bias=qkv_bias, -# mlp_ratio=mlp_ratio, -# scale_mlp=scale_mlp, -# swiglu_mlp=swiglu_mlp, -# proj_drop=proj_drop_rate, -# attn_drop=attn_drop_rate, -# drop_path=dpr[i], -# norm_layer=norm_layer, -# init_values=init_values, -# window_size=self.patch_embed.grid_size if use_rel_pos_bias else None, -# ) -# for i in range(depth)]) -# self.feature_info = [ -# dict(module=f'blocks.{i}', num_chs=embed_dim, reduction=r) for i in range(depth)] - -# use_fc_norm = self.global_pool == 'avg' -# self.norm = nn.Identity() if use_fc_norm else norm_layer(embed_dim) -# self.fc_norm = norm_layer(embed_dim) if use_fc_norm else nn.Identity() -# self.head_drop = nn.Dropout(drop_rate) -# self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() - -# self.apply(self._init_weights) -# if self.pos_embed is not None: -# trunc_normal_(self.pos_embed, std=.02) -# trunc_normal_(self.cls_token, std=.02) - -# self.fix_init_weight() -# if isinstance(self.head, nn.Linear): -# trunc_normal_(self.head.weight, std=.02) -# self.head.weight.data.mul_(head_init_scale) -# self.head.bias.data.mul_(head_init_scale) - -# def fix_init_weight(self): -# def rescale(param, layer_id): -# param.div_(math.sqrt(2.0 * layer_id)) - -# for layer_id, layer in enumerate(self.blocks): -# rescale(layer.attn.proj.weight.data, layer_id + 1) -# rescale(layer.mlp.fc2.weight.data, layer_id + 1) - -# def _init_weights(self, m): -# if isinstance(m, nn.Linear): -# trunc_normal_(m.weight, std=.02) -# if isinstance(m, nn.Linear) and m.bias is not None: -# nn.init.constant_(m.bias, 0) -# elif isinstance(m, nn.LayerNorm): -# nn.init.constant_(m.bias, 0) -# nn.init.constant_(m.weight, 1.0) - -# def no_weight_decay(self): -# nwd = {'pos_embed', 'cls_token'} -# for n, _ in self.named_parameters(): -# if 'relative_position_bias_table' in n: -# nwd.add(n) -# return nwd - -# def set_grad_checkpointing(self, enable=True): -# self.grad_checkpointing = enable - -# def group_matcher(self, coarse=False): -# matcher = dict( -# stem=r'^cls_token|pos_embed|patch_embed|rel_pos_bias', # stem and embed -# blocks=[(r'^blocks\.(\d+)', None), (r'^norm', (99999,))], -# ) -# return matcher - -# def get_classifier(self) -> nn.Module: -# return self.head - -# def reset_classifier(self, num_classes: int, global_pool: Optional[str] = None): -# self.num_classes = num_classes -# if global_pool is not None: -# self.global_pool = global_pool -# self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() - -# def forward_intermediates( -# self, -# x: mindspore.Tensor, -# indices: Optional[Union[int, List[int]]] = None, -# return_prefix_tokens: bool = False, -# norm: bool = False, -# stop_early: bool = False, -# output_fmt: str = 'NCHW', -# intermediates_only: bool = False, -# ) -> Union[List[mindspore.Tensor], Tuple[mindspore.Tensor, 
List[mindspore.Tensor]]]: -# """ Forward features that returns intermediates. - -# Args: -# x: Input image tensor -# indices: Take last n blocks if an int, if is a sequence, select by matching indices -# return_prefix_tokens: Return both prefix and spatial intermediate tokens -# norm: Apply norm layer to all intermediates -# stop_early: Stop iterating over blocks when last desired intermediate hit -# output_fmt: Shape of intermediate feature outputs -# intermediates_only: Only return intermediate features -# Returns: - -# """ -# assert output_fmt in ('NCHW', 'NLC'), 'Output format must be one of NCHW or NLC.' -# reshape = output_fmt == 'NCHW' -# intermediates = [] -# take_indices, max_index = feature_take_indices(len(self.blocks), indices) - -# # forward pass -# B, _, height, width = x.shape -# x = self.patch_embed(x) -# x = ops.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) -# if self.pos_embed is not None: -# x = x + self.pos_embed -# x = self.pos_drop(x) - -# rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None -# if not stop_early: # can't slice blocks in torchscript -# blocks = self.blocks -# else: -# blocks = self.blocks[:max_index + 1] -# for i, blk in enumerate(blocks): -# x = blk(x, shared_rel_pos_bias=rel_pos_bias) -# if i in take_indices: -# # normalize intermediates with final norm layer if enabled -# intermediates.append(self.norm(x) if norm else x) - -# # process intermediates -# if self.num_prefix_tokens: -# # split prefix (e.g. class, distill) and spatial feature tokens -# prefix_tokens = [y[:, 0:self.num_prefix_tokens] for y in intermediates] -# intermediates = [y[:, self.num_prefix_tokens:] for y in intermediates] -# if reshape: -# # reshape to BCHW output format -# H, W = self.patch_embed.dynamic_feat_size((height, width)) -# intermediates = [y.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() for y in intermediates] - -# if intermediates_only: -# return intermediates - -# x = self.norm(x) - -# return x, intermediates - -# def prune_intermediate_layers( -# self, -# indices: Union[int, List[int]] = 1, -# prune_norm: bool = False, -# prune_head: bool = True, -# ): -# """ Prune layers not required for specified intermediates. 
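A small sketch of the intermediate-selection convention documented in forward_intermediates above: an int takes the last n blocks, a sequence selects blocks by index. This mirrors the docstring's description, not the deleted feature_take_indices helper itself; the negative-index handling is an assumption.

```python
def take_last_or_listed(num_blocks, indices):
    """Int -> last n block indices; sequence -> given indices (negatives from the end)."""
    if indices is None:
        indices = num_blocks                                   # default: every block
    if isinstance(indices, int):
        chosen = list(range(num_blocks - indices, num_blocks))
    else:
        chosen = [i if i >= 0 else num_blocks + i for i in indices]
    return chosen, max(chosen)

print(take_last_or_listed(12, 3))           # ([9, 10, 11], 11) -> last three blocks
print(take_last_or_listed(12, [2, 5, -1]))  # ([2, 5, 11], 11)
```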
-# """ -# take_indices, max_index = feature_take_indices(len(self.blocks), indices) -# self.blocks = self.blocks[:max_index + 1] # truncate blocks -# if prune_norm: -# self.norm = nn.Identity() -# if prune_head: -# self.fc_norm = nn.Identity() -# self.reset_classifier(0, '') -# return take_indices - -# def forward_features(self, x): -# x = self.patch_embed(x) -# x = ops.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) -# if self.pos_embed is not None: -# x = x + self.pos_embed -# x = self.pos_drop(x) - -# rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None -# for blk in self.blocks: -# if self.grad_checkpointing: -# x = checkpoint(blk, x, shared_rel_pos_bias=rel_pos_bias) -# else: -# x = blk(x, shared_rel_pos_bias=rel_pos_bias) -# x = self.norm(x) -# return x - -# def forward_head(self, x, pre_logits: bool = False): -# if self.global_pool: -# x = x[:, self.num_prefix_tokens:].mean(dim=1) if self.global_pool == 'avg' else x[:, 0] -# x = self.fc_norm(x) -# x = self.head_drop(x) -# return x if pre_logits else self.head(x) - -# def forward(self, x): -# x = self.forward_features(x) -# x = self.forward_head(x) -# return x - - -# def _cfg(url='', **kwargs): -# return { -# 'url': url, -# 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, -# 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True, -# 'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5), -# 'first_conv': 'patch_embed.proj', 'classifier': 'head', -# **kwargs -# } - - -# default_cfgs = generate_default_cfgs({ -# 'beit_base_patch16_224.in22k_ft_in22k_in1k': _cfg( -# #url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth', -# hf_hub_id='timm/'), -# 'beit_base_patch16_384.in22k_ft_in22k_in1k': _cfg( -# #url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_384_pt22k_ft22kto1k.pth', -# hf_hub_id='timm/', -# input_size=(3, 384, 384), crop_pct=1.0, -# ), -# 'beit_base_patch16_224.in22k_ft_in22k': _cfg( -# #url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22k.pth', -# hf_hub_id='timm/', -# num_classes=21841, -# ), -# 'beit_large_patch16_224.in22k_ft_in22k_in1k': _cfg( -# #url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22kto1k.pth', -# hf_hub_id='timm/'), -# 'beit_large_patch16_384.in22k_ft_in22k_in1k': _cfg( -# #url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_384_pt22k_ft22kto1k.pth', -# hf_hub_id='timm/', -# input_size=(3, 384, 384), crop_pct=1.0, -# ), -# 'beit_large_patch16_512.in22k_ft_in22k_in1k': _cfg( -# #url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_512_pt22k_ft22kto1k.pth', -# hf_hub_id='timm/', -# input_size=(3, 512, 512), crop_pct=1.0, -# ), -# 'beit_large_patch16_224.in22k_ft_in22k': _cfg( -# #url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth', -# hf_hub_id='timm/', -# num_classes=21841, -# ), - -# 'beitv2_base_patch16_224.in1k_ft_in22k_in1k': _cfg( -# #url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_base_patch16_224_pt1k_ft21kto1k.pth', -# hf_hub_id='timm/', -# mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD -# ), -# 'beitv2_base_patch16_224.in1k_ft_in1k': _cfg( -# 
#url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_base_patch16_224_pt1k_ft1k.pth', -# hf_hub_id='timm/', -# mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD -# ), -# 'beitv2_base_patch16_224.in1k_ft_in22k': _cfg( -# #url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_base_patch16_224_pt1k_ft21k.pth', -# hf_hub_id='timm/', -# num_classes=21841, mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD -# ), -# 'beitv2_large_patch16_224.in1k_ft_in22k_in1k': _cfg( -# #url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_large_patch16_224_pt1k_ft21kto1k.pth', -# hf_hub_id='timm/', -# crop_pct=0.95, mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD -# ), -# 'beitv2_large_patch16_224.in1k_ft_in1k': _cfg( -# #url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_large_patch16_224_pt1k_ft1k.pth', -# hf_hub_id='timm/', -# crop_pct=0.95, mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD -# ), -# 'beitv2_large_patch16_224.in1k_ft_in22k': _cfg( -# #url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_large_patch16_224_pt1k_ft21k.pth', -# hf_hub_id='timm/', -# num_classes=21841, mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD -# ), -# }) - - -# def checkpoint_filter_fn(state_dict, model, interpolation='bicubic', antialias=True): -# state_dict = state_dict.get('model', state_dict) -# state_dict = state_dict.get('module', state_dict) -# # beit v2 didn't strip module - -# out_dict = {} -# for k, v in state_dict.items(): -# if 'relative_position_index' in k: -# continue -# if 'patch_embed.proj.weight' in k: -# O, I, H, W = model.patch_embed.proj.weight.shape -# if v.shape[-1] != W or v.shape[-2] != H: -# v = resample_patch_embed( -# v, -# (H, W), -# interpolation=interpolation, -# antialias=antialias, -# verbose=True, -# ) -# elif k == 'pos_embed' and v.shape[1] != model.pos_embed.shape[1]: -# # To resize pos embedding when using model at different size from pretrained weights -# num_prefix_tokens = 1 -# v = resample_abs_pos_embed( -# v, -# new_size=model.patch_embed.grid_size, -# num_prefix_tokens=num_prefix_tokens, -# interpolation=interpolation, -# antialias=antialias, -# verbose=True, -# ) -# elif k.endswith('relative_position_bias_table'): -# m = model.get_submodule(k[:-29]) -# if v.shape != m.relative_position_bias_table.shape or m.window_size[0] != m.window_size[1]: -# v = resize_rel_pos_bias_table( -# v, -# new_window_size=m.window_size, -# new_bias_shape=m.relative_position_bias_table.shape, -# ) -# out_dict[k] = v -# return out_dict - - -# def _create_beit(variant, pretrained=False, **kwargs): -# out_indices = kwargs.pop('out_indices', 3) -# model = build_model_with_cfg( -# Beit, variant, pretrained, -# pretrained_filter_fn=checkpoint_filter_fn, -# feature_cfg=dict(out_indices=out_indices, feature_cls='getter'), -# **kwargs, -# ) -# return model - - -# @register_model -# def beit_base_patch16_224(pretrained=False, **kwargs) -> Beit: -# model_args = dict( -# patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, -# use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=0.1) -# model = _create_beit('beit_base_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs)) -# return model - - -# @register_model -# def beit_base_patch16_384(pretrained=False, **kwargs) -> Beit: -# model_args = dict( -# img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12, -# use_abs_pos_emb=False, use_rel_pos_bias=True, 
init_values=0.1) -# model = _create_beit('beit_base_patch16_384', pretrained=pretrained, **dict(model_args, **kwargs)) -# return model - - -# @register_model -# def beit_large_patch16_224(pretrained=False, **kwargs) -> Beit: -# model_args = dict( -# patch_size=16, embed_dim=1024, depth=24, num_heads=16, -# use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5) -# model = _create_beit('beit_large_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs)) -# return model - - -# @register_model -# def beit_large_patch16_384(pretrained=False, **kwargs) -> Beit: -# model_args = dict( -# img_size=384, patch_size=16, embed_dim=1024, depth=24, num_heads=16, -# use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5) -# model = _create_beit('beit_large_patch16_384', pretrained=pretrained, **dict(model_args, **kwargs)) -# return model - - -# @register_model -# def beit_large_patch16_512(pretrained=False, **kwargs) -> Beit: -# model_args = dict( -# img_size=512, patch_size=16, embed_dim=1024, depth=24, num_heads=16, -# use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5) -# model = _create_beit('beit_large_patch16_512', pretrained=pretrained, **dict(model_args, **kwargs)) -# return model - - -# @register_model -# def beitv2_base_patch16_224(pretrained=False, **kwargs) -> Beit: -# model_args = dict( -# patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, -# use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5) -# model = _create_beit('beitv2_base_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs)) -# return model - - -# @register_model -# def beitv2_large_patch16_224(pretrained=False, **kwargs) -> Beit: -# model_args = dict( -# patch_size=16, embed_dim=1024, depth=24, num_heads=16, -# use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5) -# model = _create_beit('beitv2_large_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs)) -# return model diff --git a/mindnlp/mimm/models/bit.py b/mindnlp/mimm/models/bit.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mindnlp/parallel/__init__.py b/mindnlp/parallel/__init__.py deleted file mode 100644 index 6548ed5d4..000000000 --- a/mindnlp/parallel/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""MindNLP Parallel modules, which is ported from Megatron.""" - -from . 
import tensor_parallel -from .tensor_parallel import * - -__all__ = [] -__all__.extend(tensor_parallel.__all__) diff --git a/mindnlp/parallel/comm_func.py b/mindnlp/parallel/comm_func.py deleted file mode 100644 index 7cf2b181f..000000000 --- a/mindnlp/parallel/comm_func.py +++ /dev/null @@ -1,191 +0,0 @@ -"""communication functional api.""" -from mindspore import ops, Tensor -from mindspore.ops.operations._inner_ops import Send, Receive -from mindspore.communication import GlobalComm, get_group_rank_from_world_rank -from mindspore.ops._primitive_cache import _get_cache_prim -try: - from mindspore._c_expression import TensorPy as Tensor_ # pylint: disable=no-name-in-module -except: - from mindspore._c_expression import Tensor as Tensor_ # pylint: disable=no-name-in-module - - -def isend(tensor, dst=0, group=GlobalComm.WORLD_COMM_GROUP, tag=0): - """ - Send tensors to the specified dest_rank. - - Note: - Send and Receive must be used in combination and have same tag. - - Args: - tensor (Tensor): The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. - dst (int, optional): A required integer identifying the destination rank(global rank). Default: 0. - group (str, optional): The communication group to work on. - Default: "hccl_world_group" on Ascend, "nccl_world_group" on GPU. - tag (int, optional): A required integer identifying the send/recv message tag. The message will - be received by the Receive op with the same "tag". Default: 0. - - Raises: - TypeError: `dst` is not an int or `group` is not a str。 - ValueError: If the rank ID of the process is greater than the rank size of the communication group. - - Supported Platforms: - ``Ascend`` ``GPU`` - - Examples: - .. note:: - Before running the following examples, you need to configure the communication environment variables. - - For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method - without any third-party or configuration file dependencies. - Please see the `msrun start up - `_ - for more details. - - This example should be run with 2 devices. - - >>> from mindspore import ops - >>> import mindspore.nn as nn - >>> from mindspore.communication import init - >>> from mindspore.communication.comm_func import isend - >>> from mindspore import Tensor - >>> import numpy as np - >>> - >>> init() - >>> input_ = Tensor(np.ones([2, 8]).astype(np.float32)) - >>> isend(input_, 0) - """ - if not isinstance(tensor, (Tensor, Tensor_)): - raise TypeError("For isend, the input tensor must be tensor") - _dst = get_group_rank_from_world_rank(dst, group) - _op = _get_cache_prim(Send)(tag, _dst, group, group) - _depend = _get_cache_prim(ops.Depend)() - return _depend(tensor, _op(tensor)) - - -def irecv(tensor, src=0, group=GlobalComm.WORLD_COMM_GROUP, tag=0): - """ - Receive tensors from src. - - Note: - Send and Receive must be used in combination and have same tag. - The shape and dtype of input `tensor` is used to receive tensor, but the value - of input `tensor` would not take effect. - Only support PyNative mode, Graph mode is not currently supported. - - Args: - tensor (Tensor): The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. The shape and dtype of this - tensor is used to receive tensor, but the value of input `tensor` would not take effect. - src (int, optional): A required integer identifying the source rank(global rank). Default: 0. - group (str, optional): The communication group to work on. - Default: "hccl_world_group" on Ascend, "nccl_world_group" on GPU. 
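A hedged usage sketch for the isend/irecv pair removed here, following their docstrings: rank 0 sends a tensor, rank 1 receives into a placeholder of the same shape and dtype with a matching tag. It assumes a two-process msrun launch with the communication environment configured, and a checkout that still provides mindnlp/parallel/comm_func.py (the file this hunk deletes).

```python
import numpy as np
import mindspore as ms
from mindspore.communication import init, get_rank
# Import path is only valid before this deletion is applied.
from mindnlp.parallel.comm_func import isend, irecv

init()
data = ms.Tensor(np.arange(8, dtype=np.float32).reshape(2, 4))
if get_rank() == 0:
    isend(data, dst=1, tag=7)                      # tag must match the receiving side
else:
    placeholder = ms.Tensor(np.zeros((2, 4), dtype=np.float32))
    out = irecv(placeholder, src=0, tag=7)         # shape/dtype come from the placeholder
    print(out)
```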
- tag (int, optional): A required integer identifying the send/recv message tag. The message will - be received by the Send op with the same "tag". Default: 0. - - Returns: - Tensor, the shape of output is :math:`(x_1, x_2, ..., x_R)`. - - Raises: - TypeError: If `src` is not an int or `group` is not a str. - ValueError: If the rank ID of the process is greater than the rank size of the communication group. - - Supported Platforms: - ``Ascend`` ``GPU`` - - Examples: - .. note:: - Before running the following examples, you need to configure the communication environment variables. - - For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method - without any third-party or configuration file dependencies. - Please see the `msrun start up - `_ - for more details. - - This example should be run with 2 devices. - - >>> from mindspore import ops - >>> import mindspore.nn as nn - >>> from mindspore.communication import init - >>> from mindspore.communication.comm_func import irecv - >>> from mindspore import Tensor - >>> import numpy as np - >>> - # Launch 2 processes. - Process 0 send the following array to Process 1 - [[ 0. 1.] - [ 2. 3.]] - >>> init() - >>> x = ms.Tensor(np.zeros([2, 2])) - # Process 1 receive tensor from Process 0. - >>> out = irecv(x, src=0) - >>> print(out) - [[ 0. 1.] - [ 2. 3.]] - """ - _src = get_group_rank_from_world_rank(src, group) - shape = tensor.shape - dtype = tensor.dtype - _op = _get_cache_prim(Receive)(tag, _src, shape, dtype, group, group) - return _op(tensor) - -def broadcast(tensor, src=0, group=GlobalComm.WORLD_COMM_GROUP): - """ - Broadcasts the tensor to the whole group. - - Note: - The tensors must have the same shape and format in all processes of the collection. - Only support PyNative mode, Graph mode is not currently supported. - - Args: - tensor (Tensor): The tensor to be broadcasted. The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. - src (int, optional): Specifies the rank(global rank) of the process that broadcast the tensor. - And only process `src` will broadcast the tensor. - group (str, optional): The communication group to work on. Default: ``GlobalComm.WORLD_COMM_GROUP``. - - Returns: - Tensor, tensor has the same shape as input tensor :math:`(x_1, x_2, ..., x_R)`. - - Raises: - TypeError: If src is not an integer or group is not a string. - RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails. - - Supported Platforms: - ``Ascend`` ``GPU`` - - Examples: - .. note:: - Before running the following examples, you need to configure the communication environment variables. - - For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method - without any third-party or configuration file dependencies. - Please see the `msrun start up - `_ - for more details. - - This example should be run with 2 devices. - - >>> import mindspore as ms - >>> from mindspore import Tensor - >>> from mindspore.communication import init - >>> from mindspore.communication.comm_func import broadcast - >>> import numpy as np - >>> # Launch 2 processes. - >>> - >>> init() - >>> data = ms.Tensor(np.arange(8).reshape([2, 4]).astype(np.float32)) - >>> out = broadcast(tensor=data, src=0) - [[0. 1. 2. 3.] - [4. 5. 6. 
7.]] - - Tutorial Examples: - - `Distributed Set Communication Primitives - Broadcast - `_ - - """ - if not isinstance(tensor, (Tensor, Tensor_)): - raise TypeError("For broadcast, the input tensor must be tensor") - if not isinstance(src, int): - raise TypeError("For broadcast, the src must be int") - _src = get_group_rank_from_world_rank(src, group) - _op = _get_cache_prim(ops.Broadcast)(_src, group) - return _op((tensor,))[0] diff --git a/mindnlp/parallel/pipeline_parallel/__init__.py b/mindnlp/parallel/pipeline_parallel/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mindnlp/parallel/tensor_parallel/__init__.py b/mindnlp/parallel/tensor_parallel/__init__.py deleted file mode 100644 index aefb3b656..000000000 --- a/mindnlp/parallel/tensor_parallel/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Tensor Parallel modules, which is ported from Megatron.""" - -from . import layers -from .layers import * - -__all__ = [] -__all__.extend(layers.__all__) diff --git a/mindnlp/parallel/tensor_parallel/layers.py b/mindnlp/parallel/tensor_parallel/layers.py deleted file mode 100644 index 79315ad14..000000000 --- a/mindnlp/parallel/tensor_parallel/layers.py +++ /dev/null @@ -1,454 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Tensor Parallel Layers""" - -from typing import Optional, Union - -import mindspore -from mindspore import Tensor -from mindspore.common.initializer import Initializer, Zero - -from mindnlp.core import nn, ops -from mindnlp.core.nn import Parameter -from .mappings import _get_rank, _get_group_size - - -from .mappings import ( - copy_to_model_parallel_region, - gather_from_model_parallel_region, - reduce_from_model_parallel_region, - scatter_to_model_parallel_region, -) -from .utils import VocabUtility, divide_and_check_no_remainder - - -class VocabParallelEmbedding(nn.Module): - """Embedding parallelized in the vocabulary dimension. - - This is mainly adapted from mindspore.nn.Embedding and all the default - values are kept. - Arguments: - vocab_size: vocabulary size. - embedding_size: size of hidden state. - init_method: method to initialize weights. 
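A NumPy sketch of the vocabulary-dimension partition described above and implemented in VocabParallelEmbedding.forward: each rank keeps a contiguous slice of the vocabulary, masks ids it does not own, gathers from its local shard, zeroes the masked rows, and an all-reduce (modelled here as a plain sum) restores the full lookup. The two-rank split and sizes are illustrative.

```python
import numpy as np

vocab, dim, ranks = 8, 4, 2
rng = np.random.default_rng(0)
full_table = rng.standard_normal((vocab, dim))
ids = np.array([0, 3, 5, 7])

partials = []
per_rank = vocab // ranks
for rank in range(ranks):
    start, end = rank * per_rank, (rank + 1) * per_rank
    shard = full_table[start:end]                 # this rank's weight slice
    mask = (ids < start) | (ids >= end)           # ids owned by other ranks
    local = np.where(mask, 0, ids - start)        # shift into shard-local index space
    out = shard[local]                            # local gather
    out[mask] = 0.0                               # zero rows this rank does not own
    partials.append(out)

assert np.allclose(sum(partials), full_table[ids])  # the "all-reduce" recovers the lookup
```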
- """ - def __init__( - self, - vocab_size: int, - embedding_size: int, - padding_idx: Optional[int] = None, - init_method: Union[str, Initializer] = Zero(), - dtype: mindspore.dtype = mindspore.float32, - ) -> None: - r""" - Args: - vocab_size (int): The size of the vocabulary. - embedding_size (int): The size of the word embeddings. - padding_idx (Optional[int], optional): The index for padding. Defaults to None. - init_method (Union[str, Initializer]): The method for initializing the embedding weights. Can be a string representing the method (e.g., 'Zero') or an instance of the Initializer class. - dtype (mindspore.dtype): The data type of the embedding weights. Defaults to mindspore.float32. - - Returns: - None: This method does not return any value. - - Raises: - ValueError: If vocab_size is not a positive integer. - ValueError: If embedding_size is not a positive integer. - ValueError: If padding_idx is not None or a positive integer. - TypeError: If init_method is not a string or an instance of Initializer. - ValueError: If dtype is not a valid mindspore data type. - """ - super().__init__() - # Keep the input dimensions. - self.vocab_size = vocab_size - self.embedding_size = embedding_size - self.padding_idx = padding_idx - # Divide the weight matrix along the vocaburaly dimension. - ( - self.vocab_start_index, - self.vocab_end_index, - ) = VocabUtility.vocab_range_from_global_vocab_size( - self.vocab_size, _get_rank(), _get_group_size() - ) - self.vocab_size_per_partition = self.vocab_end_index - self.vocab_start_index - - # Allocate weights. - self.weight = Parameter(Tensor(shape=(self.vocab_size_per_partition, self.embedding_size), - dtype=dtype, init=init_method), - "weight") - - def forward(self, input_: Tensor) -> Tensor: # type: ignore - r""" - Constructs a parallel embedding for the given input tensor. - - Args: - self (VocabParallelEmbedding): An instance of the VocabParallelEmbedding class. - input_ (Tensor): The input tensor to forward the parallel embedding for. - - Returns: - Tensor: A tensor representing the parallel embedding of the input tensor. - - Raises: - None - - This method forwards a parallel embedding for the input tensor by performing the following steps: - - 1. Create an input mask by checking if each element in the input tensor is less than the vocab start index or greater than or equal to the vocab end index. - 2. Subtract the vocab start index from the input tensor to obtain a masked input tensor. - 3. Replace the masked elements in the input tensor with 0 using the input mask. - 4. Get the shape of the masked input tensor. - 5. Gather the embedding weights from the VocabParallelEmbedding instance using the masked input tensor as indices. - 6. Reshape the gathered embedding weights to match the original shape of the masked input tensor. - 7. Fill the masked elements in the gathered embedding weights with 0.0 using the input mask. - 8. Reduce the output tensor from the model parallel region using the output_parallel tensor. - 9. Return the final output tensor. - - Note: The vocab start index and vocab end index are properties of the VocabParallelEmbedding class, and the embedding size is also a property of the class. - """ - # Build the mask. - input_mask = (input_ < self.vocab_start_index) | ( - input_ >= self.vocab_end_index - ) - # Mask the input. - masked_input = input_ - self.vocab_start_index - masked_input = ops.masked_fill(masked_input, input_mask, 0) - # Get the embeddings. 
- ori_shape = masked_input.shape - output_parallel = ops.gather(self.weight, masked_input.view(-1), 0).view( - ori_shape + (self.embedding_size, ) - ) - # Mask the output embedding. - output_parallel = ops.masked_fill( - output_parallel, input_mask.unsqueeze(-1), 0.0 - ) - # Reduce across all the model parallel GPUs. - output = reduce_from_model_parallel_region(output_parallel) - return output - - -class ParallelEmbedding(nn.Module): - """Embedding parallelized in the embedding dimension. - - This is mainly adapted from mindspore.nn.Embedding and all the default - values are kept. - Arguments: - vocab_size: vocabulary size. - embedding_size: size of hidden state. - init_method: method to initialize weights. - """ - def __init__( - self, - vocab_size: int, - embedding_size: int, - padding_idx: Optional[int] = None, - init_method: Union[str, Initializer] = Zero(), - dtype: mindspore.dtype = mindspore.float32, - ) -> None: - r"""Initialize the ParallelEmbedding class. - - Args: - vocab_size (int): The size of the vocabulary. - embedding_size (int): The size of each embedding vector. - padding_idx (Optional[int], optional): The index used for padding. Defaults to None. - init_method (Union[str, Initializer], optional): The method used for initializing the weight tensor. - Can be a string representing the method name or an instance of mindspore.nn.initializer.Initializer. - Defaults to Zero(). - dtype (mindspore.dtype, optional): The data type of the weight tensor. Defaults to mindspore.float32. - - Returns: - None - - Raises: - None - """ - super().__init__() - # Keep the input dimensions. - self.vocab_size = vocab_size - self.embedding_size = embedding_size - self.padding_idx = padding_idx - # Divide the weight matrix along the embedding dimension. - rank_size = _get_group_size() - self.embedding_size_per_partition = divide_and_check_no_remainder( - self.embedding_size, rank_size - ) - - # Allocate weights. - self.weight = Parameter(Tensor(shape=(self.vocab_size, self.embedding_size_per_partition), - dtype=dtype, init=init_method), - "weight") - - def forward(self, input_: Tensor) -> Tensor: # type: ignore - r""" - Constructs the parallel embedding for the given input tensor. - - Args: - self (ParallelEmbedding): The instance of the ParallelEmbedding class. - input_ (Tensor): The input tensor for which the parallel embedding is to be forwarded. It should be a tensor compatible with the model parallel region. - - Returns: - Tensor: The forwarded parallel embedding tensor of type Tensor. The shape and size of the tensor are determined by the input tensor and the embedding size per partition. - - Raises: - ModelParallelRegionError: If the input tensor is not compatible with the model parallel region. - TensorShapeError: If the shape of the input tensor does not match the expected shape for forwarding the parallel embedding. - UnsupportedOperationError: If the operation is not supported for the given input tensor or embedding size per partition. - """ - input_parallel = copy_to_model_parallel_region(input_) - ori_shape = input_parallel.shape - output_parallel = ops.gather(self.weight, input_parallel.view(-1), 0).view( - ori_shape + (self.embedding_size_per_partition, ) - ) - output = gather_from_model_parallel_region(output_parallel) - return output - - -class ColumnParallelLinear(nn.Module): - """Linear layer with column parallelism. - - The linear layer is defined as Y = XA + b. A is parallelized along - its second dimension as A = [A_1, ..., A_p]. 
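The column split described above means each rank holds one shard A_i and computes Y_i = X A_i; gathering the shards along the last dimension reproduces the full product. A minimal NumPy check of that identity (the shapes and the 4-way split are illustrative assumptions, not from this diff):

import numpy as np

rank_size = 4
X = np.random.randn(2, 8)
A = np.random.randn(8, 16)
A_shards = np.split(A, rank_size, axis=1)                     # A = [A_1, ..., A_p]
Y_gathered = np.concatenate([X @ A_i for A_i in A_shards], axis=1)
assert np.allclose(Y_gathered, X @ A)                         # all-gather of the Y_i == full XA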
- - Arguments: - in_features: first dimension of matrix A. - out_features: second dimension of matrix A. - bias: If true, add bias - gather_output: If true, call all-gether on output and make Y avaiable - to all GPUs, otherwise, every GPU will have its output - which is Y_i = XA_i - init_method: method to initialize weights. Note that bias is always set - to zero. - stride: For the strided linear layers. - keep_master_weight_for_test: This was added for testing and should be - set to False. It returns the master weights - used for initialization. - """ - def __init__( - self, - in_features: int, - out_features: int, - bias: bool = True, - gather_output: bool = True, - init_method: Union[str, Initializer] = Zero(), - dtype: mindspore.dtype = mindspore.float32, - stride: int = 1, - keep_master_weight_for_test: bool = False, - ) -> None: - r""" - __init__ - - Initialize the ColumnParallelLinear class. - - Args: - self: The object itself. - in_features (int): The size of each input sample. - out_features (int): The size of each output sample. - bias (bool, optional): If set to True, a bias term is included. Default is True. - gather_output (bool, optional): If set to True, the output from all devices will be gathered. Default is True. - init_method (Union[str, Initializer]): The method used for weight initialization. It can be a string specifying the method or an instance of Initializer. Default is Zero(). - dtype (mindspore.dtype): The data type of the parameters. Default is mindspore.float32. - stride (int, optional): The stride of the convolution. Default is 1. - keep_master_weight_for_test (bool, optional): If set to True, master weight will be kept for testing. Default is False. - - Returns: - None: This method does not return any value. - - Raises: - - TypeError: If in_features, out_features, stride are not integers or if dtype is not a valid mindspore data type. - - ValueError: If out_features is not divisible by the rank size or if init_method is not a valid initialization method. - - RuntimeError: If an error occurs during parameter initialization. - """ - super().__init__() - - # Keep input parameters - self.in_features = in_features - self.out_features = out_features - self.gather_output = gather_output - # Divide the weight matrix along the last dimension. - rank_size = _get_group_size() - self.output_size_per_partition = divide_and_check_no_remainder( - out_features, rank_size - ) - - # Parameters. - self.weight = Parameter(Tensor(shape=(self.in_features, self.output_size_per_partition), - dtype=dtype, init=init_method), - "weight") - if bias: - # Always initialize bias to zero. - self.bias = Parameter(Tensor(shape=(self.output_size_per_partition,), - dtype=dtype, init=init_method), - "bias") - else: - self.bias = None - - def get_master_weight(self) -> Tensor: - """get master weight of ColumnParallelLinear""" - return gather_from_model_parallel_region(self.weight).swapaxes(0, 1) - - def forward(self, input_: Tensor) -> Tensor: # type: ignore - r""" - Constructs the ColumnParallelLinear layer. - - Args: - self (ColumnParallelLinear): An instance of the ColumnParallelLinear class. - input_ (Tensor): The input tensor to the layer. It must have the shape (batch_size, input_dim). - - Returns: - Tensor: The output tensor of the layer. It will have the shape (batch_size, output_dim). - - Raises: - None. - """ - # Set up backprop all-reduce. - input_parallel = copy_to_model_parallel_region(input_) - # Matrix multiply. 
- output_parallel = ops.matmul(input_parallel, self.weight) - if self.bias is not None: - output_parallel = output_parallel + self.bias - if self.gather_output: - # All-gather across the partitions. - output = gather_from_model_parallel_region(output_parallel) - else: - output = output_parallel - return output - - -class RowParallelLinear(nn.Module): - """Linear layer with row parallelism. - - The linear layer is defined as Y = XA + b. A is parallelized along - its first dimension and X along its second dimension as: - - - - | A_1 | - | . | - A = | . | X = [X_1, ..., X_p] - | . | - | A_p | - - - - Arguments: - in_features: first dimension of matrix A. - out_features: second dimension of matrix A. - bias: If true, add bias. Note that bias is not parallelized. - input_is_parallel: If true, we assume that the input is already - split across the GPUs and we do not split - again. - init_method: method to initialize weights. Note that bias is always set - to zero. - stride: For the strided linear layers. - keep_master_weight_for_test: This was added for testing and should be - set to False. It returns the master weights - used for initialization. - """ - def __init__( - self, - in_features: int, - out_features: int, - bias: bool = True, - input_is_parallel: bool = False, - init_method: Union[str, Initializer] = Zero(), - dtype: mindspore.dtype = mindspore.float32, - stride: int = 1, - keep_master_weight_for_test: bool = False, - ): - r""" - Initializes a RowParallelLinear object. - - Args: - in_features (int): Number of input features. - out_features (int): Number of output features. - bias (bool, optional): Whether to include bias in the linear transformation. Default is True. - input_is_parallel (bool, optional): Whether the input is parallelized. Default is False. - init_method (Union[str, Initializer], optional): Initialization method for weights. Default is Zero(). - dtype (mindspore.dtype, optional): Data type of the tensors. Default is mindspore.float32. - stride (int, optional): Stride value for the linear transformation. Default is 1. - keep_master_weight_for_test (bool, optional): Whether to keep the master weight for testing. Default is False. - - Returns: - None: This method does not return any value. - - Raises: - - ValueError: If 'in_features' or 'out_features' is not an integer. - - TypeError: If 'bias', 'input_is_parallel', 'stride', or 'keep_master_weight_for_test' is not a boolean value. - - TypeError: If 'init_method' is not a string or an Initializer object. - - ValueError: If 'dtype' is not a valid mindspore data type. - - ValueError: If 'rank_size' cannot be determined. - - ValueError: If there is a remainder when dividing 'in_features' by 'rank_size'. - - ValueError: If the shape of the weight tensor does not match the calculated size per partition. - - ValueError: If the shape of the bias tensor does not match the number of output features. - """ - super().__init__() - - # Keep input parameters - self.in_features = in_features - self.out_features = out_features - self.input_is_parallel = input_is_parallel - # Divide the weight matrix along the last dimension. - rank_size = _get_group_size() - self.input_size_per_partition = divide_and_check_no_remainder( - in_features, rank_size - ) - - # Parameters. - # we allocate the transpose. - self.weight = Parameter(Tensor(shape=(self.input_size_per_partition, self.out_features), - dtype=dtype, init=init_method), - "weight") - if bias: - # Always initialize bias to zero. 
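For the row-parallel case sketched in the diagram above, each rank multiplies its input slice X_i by its weight shard A_i and the partial products are summed by an all-reduce. A minimal NumPy check of that identity (sizes and the 4-way split are illustrative assumptions, not from this diff):

import numpy as np

rank_size = 4
X = np.random.randn(2, 8)
A = np.random.randn(8, 16)
X_shards = np.split(X, rank_size, axis=1)                     # X = [X_1, ..., X_p]
A_shards = np.split(A, rank_size, axis=0)                     # A stacked as [A_1; ...; A_p]
partial_sum = sum(X_i @ A_i for X_i, A_i in zip(X_shards, A_shards))
assert np.allclose(partial_sum, X @ A)                        # all-reduce of partials == full XA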
- self.bias = Parameter(Tensor(shape=(self.out_features,), dtype=dtype, init=init_method), "bias") - else: - self.bias = None - - def get_master_weight(self) -> Tensor: - """get master weight of RowParallelLinear""" - return gather_from_model_parallel_region(self.weight).swapaxes(0, 1) - - def forward(self, input_: Tensor) -> Tensor: # type:ignore - r""" - This method forwards a linear layer operation in a row-parallel fashion. - - Args: - self (RowParallelLinear): The instance of the RowParallelLinear class. - input_ (Tensor): The input tensor to the linear layer operation. It should be of type Tensor. - - Returns: - Tensor: The output tensor resulting from the linear layer operation. - - Raises: - ValueError: If the input tensor is not of the expected type. - RuntimeError: If any runtime error occurs during the linear layer operation. - """ - # Set up backprop all-reduce. - if self.input_is_parallel: - input_parallel = input_ - else: - input_parallel = scatter_to_model_parallel_region(input_) - # Matrix multiply. - output_parallel = ops.matmul(input_parallel, self.weight) - # All-reduce across all the partitions. - output_ = reduce_from_model_parallel_region(output_parallel) - if self.bias is not None: - output = output_ + self.bias - else: - output = output_ - return output - - -__all__ = [ - "ColumnParallelLinear", - "RowParallelLinear", - "VocabParallelEmbedding", - "ParallelEmbedding", -] diff --git a/mindnlp/parallel/tensor_parallel/mappings.py b/mindnlp/parallel/tensor_parallel/mappings.py deleted file mode 100644 index 8a58681a0..000000000 --- a/mindnlp/parallel/tensor_parallel/mappings.py +++ /dev/null @@ -1,215 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Tensor Parallel mappings""" -import mindspore -from mindspore import nn, ops - -from mindspore.ops import constexpr -from mindspore.communication import GlobalComm -from mindspore.ops._primitive_cache import _get_cache_prim - -from .utils import concat_tensor_along_last_dim, split_tensor_along_last_dim, get_rank, get_group_size - - -@constexpr -def _get_rank(group=GlobalComm.WORLD_COMM_GROUP): - r""" - This function returns the rank of the current process within the specified communication group. - - Args: - group (int): The communication group to which the process belongs. Defaults to GlobalComm.WORLD_COMM_GROUP. - - Returns: - None: This function does not return a value. - - Raises: - None: This function does not raise any exceptions. - """ - return get_rank(group) - - -@constexpr -def _get_group_size(group=GlobalComm.WORLD_COMM_GROUP): - r""" - This function retrieves the size of the specified communication group. - - Args: - group (object): The communication group for which the size needs to be retrieved. Defaults to GlobalComm.WORLD_COMM_GROUP. - - Returns: - None: This function does not return a value. - - Raises: - None: This function does not raise any exceptions. 
- """ - return get_group_size(group) - - -def _reduce(input_: mindspore.Tensor) -> mindspore.Tensor: - """All-reduce the the input tensor across model parallel group.""" - # Bypass the function if we are using only 1 GPU. - if _get_group_size() == 1: - return input_ - - # All-reduce. - _all_reduce = _get_cache_prim(ops.AllReduce)() - output = _all_reduce(input_) - - return output - -def _split(input_: mindspore.Tensor) -> mindspore.Tensor: - """Split the tensor along its last dimension and keep the - corresponding slice.""" - # Bypass the function if we are using only 1 GPU. - rank_size = _get_group_size() - if rank_size == 1: - return input_ - - # Split along last dimension. - input_list = split_tensor_along_last_dim(input_, rank_size) - - rank = _get_rank() - output = input_list[rank] - - return output - -def _gather(input_: mindspore.Tensor) -> mindspore.Tensor: - """Gather tensors and concatinate along the last dimension.""" - # Bypass the function if we are using only 1 GPU. - rank_size = _get_group_size() - if rank_size == 1: - return input_ - - _all_gather = _get_cache_prim(ops.AllGather)() - tensor = _all_gather(input_) - # # Size and dimension. - output = concat_tensor_along_last_dim(tensor, rank_size) - - return output - -class _CopyToModelParallelRegion(nn.Cell): - """Pass the input to the model parallel region.""" - def forward(self, input_): - r""" - Constructs a new instance of the '_CopyToModelParallelRegion' class. - - Args: - self (object): The instance of the '_CopyToModelParallelRegion' class. - input_ (Any): The input value to be processed. - - Returns: - None: This method does not return any value. - - Raises: - None: This method does not raise any exceptions. - """ - return input_ - - def bprop(self, input_, out, dout): - """_CopyToModelParallelRegion backward method""" - return (_reduce(dout),) - - -class _ReduceFromModelParallelRegion(nn.Cell): - """All-redcue the input from the model parallel region.""" - def forward(self, input_): - r""" - Constructs a new instance of '_ReduceFromModelParallelRegion' class. - - Args: - self (object): The instance of the '_ReduceFromModelParallelRegion' class. - input_ (any): The input data to be processed by the method. - - Returns: - None: This method does not return any value. - - Raises: - None: This method does not raise any exceptions. - """ - return _reduce(input_) - - def bprop(self, input_, out, dout): - """_ReduceFromModelParallelRegion backward method""" - return (dout, ) - - -class _ScatterToModelParallelRegion(nn.Cell): - """Split the input and keep only the corresponding chuck to the rank.""" - def forward(self, input_): - r""" - Constructs a scatter to model parallel region within the _ScatterToModelParallelRegion class. - - Args: - self (_ScatterToModelParallelRegion): The instance of the _ScatterToModelParallelRegion class. - input_ (any): The input data to be processed. - - Returns: - None: This method does not return any value. - - Raises: - N/A - """ - return _split(input_) - - def bprop(self, input_, out, dout): - """_ScatterToModelParallelRegion backward method""" - return (_gather(dout),) - -class _GatherFromModelParallelRegion(nn.Cell): - """Gather the input from model parallel region and concatinate.""" - def forward(self, input_): - r""" - This method forwards a gather operation from the input. - - Args: - self (_GatherFromModelParallelRegion): The instance of the _GatherFromModelParallelRegion class. - input_ (object): The input data to be gathered. 
- - Returns: - None: This method does not return any value. - - Raises: - - Any exceptions raised by the _gather function when processing the input data. - """ - return _gather(input_) - - def bprop(self, input_, out, dout): - """_GatherFromModelParallelRegion backward method""" - return (_split(dout),) - -_copyToModel = _CopyToModelParallelRegion() -_reduceFromModel = _ReduceFromModelParallelRegion() -_scatterToModel = _ScatterToModelParallelRegion() -_gatherFromModel = _GatherFromModelParallelRegion() - - -def copy_to_model_parallel_region(input_: mindspore.Tensor) -> mindspore.Tensor: - """copy to model parallel region""" - return _copyToModel(input_) - - -def reduce_from_model_parallel_region(input_: mindspore.Tensor) -> mindspore.Tensor: - """reduce from model parallel region""" - return _reduceFromModel(input_) - - -def scatter_to_model_parallel_region(input_: mindspore.Tensor) -> mindspore.Tensor: - """scatter to model parallel region""" - return _scatterToModel(input_) - - -def gather_from_model_parallel_region(input_: mindspore.Tensor) -> mindspore.Tensor: - """gather from model parallel region""" - return _gatherFromModel(input_) diff --git a/mindnlp/parallel/tensor_parallel/utils.py b/mindnlp/parallel/tensor_parallel/utils.py deleted file mode 100644 index ff959cc71..000000000 --- a/mindnlp/parallel/tensor_parallel/utils.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Tensor Parallel Utils""" -from typing import Tuple - -import mindspore -from mindspore import ops -from mindspore.communication import GlobalComm - - -def ensure_divisibility(numerator: int, denominator: int) -> None: - """Ensure that numerator is divisible by the denominator.""" - assert numerator % denominator == 0, f"{numerator} is not divisible by {denominator}" - - -def divide_and_check_no_remainder(numerator: int, denominator: int) -> int: - """Ensure that numerator is divisible by the denominator and return - the division value.""" - ensure_divisibility(numerator, denominator) - return numerator // denominator - - -def split_tensor_along_last_dim( - tensor: mindspore.Tensor, num_partitions: int -) -> Tuple[mindspore.Tensor, ...]: - """Split a tensor along its last dimension. - Arguments: - tensor: input tensor. - num_partitions: number of partitions to split the tensor - """ - # Get the size and dimension. - last_dim = tensor.ndim - 1 - # Split. 
- last_dim_size = divide_and_check_no_remainder(tensor.shape[last_dim], num_partitions) - tensor_list = ops.split(tensor, last_dim_size, axis=last_dim) - - return tensor_list - -def concat_tensor_along_last_dim(tensor, num_partitions): - """Concat a tensor along its last dimension.""" - last_dim = tensor.ndim - 1 - tensor_list = ops.split(tensor, divide_and_check_no_remainder(tensor.shape[0], num_partitions), axis=0) - output = ops.concat(tensor_list, axis=last_dim) - return output - - -class VocabUtility: - """Split the vocabulary into `rank_size` chunks amd return the - first and last index of the vocabulary belonging to the `rank_id` - partition: Note that indices in [first, last)""" - @staticmethod - def vocab_range_from_per_partition_vocab_size( - per_partition_vocab_size: int, rank_id: int - ) -> Tuple[int, int]: - """get vocab range from vocab size of each partition""" - index_f = rank_id * per_partition_vocab_size - index_l = index_f + per_partition_vocab_size - return index_f, index_l - - @staticmethod - def vocab_range_from_global_vocab_size(global_vocab_size: int, rank_id: int, rank_size: int) -> Tuple[int, int]: - """get vocab range from global vocab size""" - per_partition_vocab_size = divide_and_check_no_remainder(global_vocab_size, rank_size) - return VocabUtility.vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank_id) - -def get_rank(group=GlobalComm.WORLD_COMM_GROUP): - """get rank""" - return mindspore.communication.get_rank(group) - -def get_group_size(group=GlobalComm.WORLD_COMM_GROUP): - """get group size""" - return mindspore.communication.get_group_size(group) diff --git a/mindnlp/peft/__init__.py b/mindnlp/peft/__init__.py deleted file mode 100644 index 6e50a2130..000000000 --- a/mindnlp/peft/__init__.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================ -"""Parameter effcient fine tuning modules, like huggingface peft.""" -from .mapping import ( - MODEL_TYPE_TO_PEFT_MODEL_MAPPING, - PEFT_TYPE_TO_CONFIG_MAPPING, - get_peft_config, - get_peft_model, - inject_adapter_in_model, -) - -from .peft_model import ( - PeftModel, - PeftModelForCausalLM, - PeftModelForFeatureExtraction, - # PeftModelForQuestionAnswering, - PeftModelForSeq2SeqLM, - PeftModelForSequenceClassification, - PeftModelForTokenClassification, -) - -from .tuners import ( - AdaptionPromptConfig, - AdaptionPromptModel, - LoraConfig, - LoraModel, - IA3Config, - IA3Model, - LoKrConfig, - LoKrModel, - AdaLoraConfig, - AdaLoraModel, - PromptTuningConfig, - PrefixTuningConfig, - PromptEncoderConfig, - MultitaskPromptTuningConfig, - MultitaskPromptTuningInit, - LoHaConfig, - LoHaModel, - PolyConfig, - PolyModel, - LNTuningConfig, - LNTuningModel, -) - -from .utils import ( - # TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING, - PeftType, - TaskType, - # bloom_model_postprocess_past_key_value, - get_peft_model_state_dict, - # prepare_model_for_int8_training, - # prepare_model_for_kbit_training, - set_peft_model_state_dict, - shift_tokens_right, - load_peft_weights, -) - -from .config import ( - PeftConfig, - PromptLearningConfig, -) diff --git a/mindnlp/peft/config.py b/mindnlp/peft/config.py deleted file mode 100644 index 3e9cece90..000000000 --- a/mindnlp/peft/config.py +++ /dev/null @@ -1,187 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""configs""" - -import json -import os -from dataclasses import asdict, dataclass, field -from typing import Optional, Union - -from .utils import CONFIG_NAME, PeftType, TaskType - -@dataclass -class PeftConfigMixin(): - r""" - This is the base configuration class for PEFT adapter models. - It contains all the methods that are common to all PEFT adapter models. - The method `save_pretrained` will save the configuration of your adapter model in a directory. - The method `from_pretrained` will load the configuration of your adapter model from a directory. - - Args: - peft_type (Union[[`~peft.utils.config.PeftType`], `str`]): The type of Peft method to use. - """ - peft_type: Optional[PeftType] = field(default=None, metadata={"help": "The type of PEFT model."}) - - @property - def __dict__(self): - r""" - Method '__dict__' in the class 'PeftConfigMixin' returns a dictionary representation of the object using the 'asdict' function. - - Args: - self: The instance of the class. This parameter represents the object for which the dictionary representation is generated. - - Returns: - None. The method does not return any value explicitly, as the dictionary representation is retrieved internally. - - Raises: - No exceptions are explicitly raised by this method. 
- """ - return asdict(self) - - def to_dict(self): - """to dict""" - return self.__dict__ - - def save_pretrained(self, save_directory, **kwargs): - r""" - This method saves the configuration of your adapter model in a directory. - - Args: - save_directory (`str`): - The directory where the configuration will be saved. - kwargs (additional keyword arguments, *optional*): - Additional keyword arguments passed along to the - [`~transformers.utils.PushToHubMixin.push_to_hub`] method. - """ - if os.path.isfile(save_directory): - raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") - - os.makedirs(save_directory, exist_ok=True) - - output_dict = asdict(self) - # converting set type to list - for key, value in output_dict.items(): - if isinstance(value, set): - output_dict[key] = list(value) - output_path = os.path.join(save_directory, CONFIG_NAME) - - # save it - with open(output_path, "w", encoding='utf-8') as writer: - writer.write(json.dumps(output_dict, indent=2, sort_keys=True)) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, subfolder=None, **kwargs): - r""" - This method loads the configuration of your adapter model from a directory. - - Args: - pretrained_model_name_or_path (`str`): - The directory or the Hub repository id where the configuration is saved. - kwargs (additional keyword arguments, *optional*): - Additional keyword arguments passed along to the child class initialization. - """ - path = ( - os.path.join(pretrained_model_name_or_path, subfolder) - if subfolder is not None - else pretrained_model_name_or_path - ) - # read config file - if os.path.isfile(os.path.join(path, CONFIG_NAME)): - config_file = os.path.join(path, CONFIG_NAME) - else: - raise ValueError(f"Can't find '{CONFIG_NAME}' at '{pretrained_model_name_or_path}'") - - loaded_attributes = cls.from_json_file(config_file) - - config = cls(**kwargs) - - for key, value in loaded_attributes.items(): - if hasattr(config, key): - setattr(config, key, value) - - return config - - @classmethod - def from_json_file(cls, path_json_file, **kwargs): - r""" - Loads a configuration file from a json file. - - Args: - path_json_file (`str`): - The path to the json file. - """ - with open(path_json_file, "r", encoding='utf-8') as file: - json_object = json.load(file) - - return json_object - - @property - def is_prompt_learning(self): - r""" - Utility method to check if the configuration is for prompt learning. - """ - return False - - @property - def is_adaption_prompt(self) -> bool: - """Return True if this is an adaption prompt config.""" - return False - - -@dataclass -class PeftConfig(PeftConfigMixin): - """ - This is the base configuration class to store the configuration of a [`PeftModel`]. - - Args: - peft_type (Union[[`~peft.utils.config.PeftType`], `str`]): The type of Peft method to use. - task_type (Union[[`~peft.utils.config.TaskType`], `str`]): The type of task to perform. - inference_mode (`bool`, defaults to `False`): Whether to use the Peft model in inference mode. 
- """ - base_model_name_or_path: str = field(default=None, metadata={"help": "The name of the base model to use."}) - peft_type: Union[str, PeftType] = field(default=None, metadata={"help": "Peft type"}) - task_type: Union[str, TaskType] = field(default=None, metadata={"help": "Task type"}) - inference_mode: bool = field(default=False, metadata={"help": "Whether to use inference mode"}) - - -# ok -@dataclass -class PromptLearningConfig(PeftConfig): - """ - This is the base configuration class to store the configuration of [`PrefixTuning`], [`PromptEncoder`], or - [`PromptTuning`]. - - Args: - num_virtual_tokens (`int`): The number of virtual tokens to use. - token_dim (`int`): The hidd-en embedding dimension of the base transformer model. - num_transformer_submodules (`int`): The number of transformer submodules in the base transformer model. - num_attention_heads (`int`): The number of attention heads in the base transformer model. - num_layers (`int`): The number of layers in the base transformer model. - """ - num_virtual_tokens: int = field(default=None, metadata={"help": "Number of virtual tokens"}) - token_dim: int = field( - default=None, metadata={"help": "The hidden embedding dimension of the base transformer model"} - ) - num_transformer_submodules: Optional[int] = field( - default=None, metadata={"help": "Number of transformer submodules"} - ) - num_attention_heads: Optional[int] = field(default=None, metadata={"help": "Number of attention heads"}) - num_layers: Optional[int] = field(default=None, metadata={"help": "Number of transformer layers"}) - @property - def is_prompt_learning(self): - r""" - Utility method to check if the configuration is for prompt learning. - """ - return True diff --git a/mindnlp/peft/mapping.py b/mindnlp/peft/mapping.py deleted file mode 100644 index 7f8bfb416..000000000 --- a/mindnlp/peft/mapping.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================ -"""mappings""" -from typing import Any, Dict - -from mindnlp.core import nn - -from .config import PeftConfig -from .peft_model import ( - PeftModel, - PeftModelForCausalLM, - # PeftModelForFeatureExtraction, - # PeftModelForQuestionAnswering, - PeftModelForSeq2SeqLM, - PeftModelForSequenceClassification, - PeftModelForTokenClassification, -) -from .tuners import ( - AdaLoraConfig, - AdaLoraModel, - LoraConfig, - LoraModel, - AdaptionPromptConfig, - IA3Config, - IA3Model, - LoKrConfig, - LoKrModel, - PromptTuningConfig, - LoHaConfig, - LoHaModel, - PolyConfig, - PolyModel, - LNTuningConfig, - LNTuningModel, - PrefixTuningConfig, - PromptEncoderConfig, -) - -MODEL_TYPE_TO_PEFT_MODEL_MAPPING = { - "SEQ_CLS": PeftModelForSequenceClassification, - "SEQ_2_SEQ_LM": PeftModelForSeq2SeqLM, - "CAUSAL_LM": PeftModelForCausalLM, - "TOKEN_CLS": PeftModelForTokenClassification, - # "QUESTION_ANS": PeftModelForQuestionAnswering, - # "FEATURE_EXTRACTION": PeftModelForFeatureExtraction, -} - - -PEFT_TYPE_TO_CONFIG_MAPPING = { - "PROMPT_TUNING": PromptTuningConfig, - "PREFIX_TUNING": PrefixTuningConfig, - "P_TUNING": PromptEncoderConfig, - "ADAPTION_PROMPT": AdaptionPromptConfig, - "LORA": LoraConfig, - "ADALORA": AdaLoraConfig, - "IA3": IA3Config, - "LOKR": LoKrConfig, - "LOHA": LoHaConfig, - "POLY": PolyConfig, - "LN_TUNING": LNTuningConfig, -} - -PEFT_TYPE_TO_TUNER_MAPPING = { - "LORA": LoraModel, - "ADALORA": AdaLoraModel, - "IA3": IA3Model, - "LOKR": LoKrModel, - "LOHA": LoHaModel, - "POLY": PolyModel, - "LN_TUNING": LNTuningModel, -} - - -def get_peft_config(config_dict: Dict[str, Any]): - """ - Returns a Peft config object from a dictionary. - - Args: - config_dict (`Dict[str, Any]`): Dictionary containing the configuration parameters. - """ - return PEFT_TYPE_TO_CONFIG_MAPPING[config_dict["peft_type"]](**config_dict) - - -def get_peft_model(model: nn.Module, peft_config: PeftConfig, adapter_name: str = "default") -> PeftModel: - """ - Returns a Peft model object from a model and a config. - - Args: - model ([`transformers.PreTrainedModel`]): Model to be wrapped. - peft_config ([`PeftConfig`]): Configuration object containing the parameters of the Peft model. - """ - model_config = getattr(model, "config", {"model_type": "custom"}) - if hasattr(model_config, "to_dict"): - model_config = model_config.to_dict() - peft_config.base_model_name_or_path = model.__dict__.get("name_or_path", None) - - # no specific task_type and is not prompt_learning - if peft_config.task_type not in MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys() and not peft_config.is_prompt_learning: - return PeftModel(model, peft_config, adapter_name=adapter_name) - - # TODO: prompt learning - # if peft_config.is_prompt_learning: - # # peft_config = _prepare_prompt_learning_config(peft_config, model_config) - return MODEL_TYPE_TO_PEFT_MODEL_MAPPING[peft_config.task_type](model, peft_config, adapter_name=adapter_name) - -def inject_adapter_in_model( - peft_config: PeftConfig, model: nn.Module, adapter_name: str = "default" -) -> nn.Module: - r""" - A simple API to create and inject adapter in-place into a model. Currently the API does not support prompt learning - methods and adaption prompt. Make sure to have the correct `target_names` set in the `peft_config` object. The API - calls `get_peft_model` under the hood but would be restricted only to non-prompt learning methods. 
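Taken together, the mappings and get_peft_model above dispatch a base model to the task-specific PeftModel subclass. A sketch of typical usage of this (now removed) API; the checkpoint name and LoRA hyper-parameters below are illustrative assumptions, not taken from this diff:

from mindnlp.transformers import AutoModelForSequenceClassification
from mindnlp.peft import LoraConfig, TaskType, get_peft_model

# hypothetical checkpoint and LoRA settings, for illustration only
base = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
config = LoraConfig(task_type=TaskType.SEQ_CLS, r=8, lora_alpha=16, lora_dropout=0.1)
peft_model = get_peft_model(base, config)      # -> PeftModelForSequenceClassification
peft_model.print_trainable_parameters()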
- - Args: - peft_config (`PeftConfig`): - Configuration object containing the parameters of the Peft model. - model (`nn.Module`): - The input model where the adapter will be injected. - adapter_name (`str`, `optional`, defaults to `"default"`): - The name of the adapter to be injected, if not provided, the default adapter name is used ("default"). - """ - if peft_config.is_prompt_learning or peft_config.is_adaption_prompt: - raise ValueError("`create_and_replace` does not support prompt learning and adaption prompt yet.") - - if peft_config.peft_type not in PEFT_TYPE_TO_TUNER_MAPPING.keys(): - raise ValueError( - f"`inject_adapter_in_model` does not support {peft_config.peft_type} yet. Please use `get_peft_model`." - ) - - tuner_cls = PEFT_TYPE_TO_TUNER_MAPPING[peft_config.peft_type] - - # By instantiating a peft model we are injecting randomly initialized LoRA layers into the model's modules. - peft_model = tuner_cls(model, peft_config, adapter_name=adapter_name) - - return peft_model.model diff --git a/mindnlp/peft/peft_model.py b/mindnlp/peft/peft_model.py deleted file mode 100644 index 48eeb7b6e..000000000 --- a/mindnlp/peft/peft_model.py +++ /dev/null @@ -1,1203 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""PEFT model.""" -import os -import warnings -import inspect -from contextlib import contextmanager -from copy import deepcopy -from typing import Dict, Optional - -import mindspore -from mindspore import Tensor -from mindspore.train.serialization import _exec_save - -from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F -from .config import PeftConfig, PromptLearningConfig -from ..transformers import PreTrainedModel - -from .tuners import ( - AdaLoraModel, - AdaptionPromptModel, - LoraModel, - IA3Model, - LoKrModel, - # LoraConfig, - PromptEmbedding, - MultitaskPromptEmbedding, - PromptEncoder, - PrefixEncoder, - LoHaModel, - PolyModel, - LNTuningModel, -) -from .utils import ( - # SAFETENSORS_WEIGHTS_NAME, - TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING, - WEIGHTS_NAME, - PeftType, - TaskType, - _prepare_prompt_learning_config, - # _set_adapter, - _set_trainable, - # add_library_to_model_card, - get_peft_model_state_dict, - load_peft_weights, - set_peft_model_state_dict, - shift_tokens_right, - _get_batch_size, # will be used for prompt learning methods -) - - -PEFT_TYPE_TO_MODEL_MAPPING = { - PeftType.LORA: LoraModel, - PeftType.ADAPTION_PROMPT: AdaptionPromptModel, - PeftType.IA3: IA3Model, - PeftType.ADALORA: AdaLoraModel, - PeftType.LOKR: LoKrModel, - PeftType.LOHA: LoHaModel, - PeftType.POLY: PolyModel, - PeftType.LN_TUNING: LNTuningModel, -} - -class PeftModel(nn.Module): - """ - Base model encompassing various Peft methods. - - Args: - model ([`~mindnlp.models.PreTrainedModel`]): The base transformer model used for Peft. - peft_config ([`PeftConfig`]): The configuration of the Peft model. 
- """ - def __init__(self, model, peft_config: PeftConfig, adapter_name="default"): - r""" - __init__ - - This method initializes an instance of the PeftModel class. - - Args: - self: The instance of the PeftModel class. - model: The base model used for the PeftModel instance. - peft_config (PeftConfig): An instance of PeftConfig class containing configuration for the PEFT (Prompt-based Entity Fine-Tuning) process. - adapter_name (str, optional): The name of the adapter being used. Defaults to 'default'. - - Returns: - None. This method does not return any value. - - Raises: - - TypeError: If the provided model is not of the expected type. - - ValueError: If the provided peft_config is not valid or does not contain necessary information. - - KeyError: If there is an issue with accessing or setting attributes. - """ - super().__init__() - self.base_model = model - self.config = getattr(self.base_model, "config", {"model_type": "custom"}) - self.modules_to_save = None - self.peft_config: Dict[str, PeftConfig] = {} - self.active_adapter = adapter_name - self.peft_type = peft_config.peft_type - self.base_model_dtype = getattr(model, "dtype", None) - self.special_peft_forward_args = {"adapter_name"} - if not peft_config.is_prompt_learning: - self.peft_config[adapter_name] = peft_config - self.base_model = PEFT_TYPE_TO_MODEL_MAPPING[peft_config.peft_type]( - self.base_model, self.peft_config, adapter_name - ) - self.set_additional_trainable_modules(peft_config, adapter_name) - else: - self.add_adapter(adapter_name, peft_config) - - # if getattr(model, "is_gradient_checkpointing", True): - # model = self._prepare_model_for_gradient_checkpointing(model) - # if hasattr(self.base_model, "config") and hasattr(self.base_model.config, "pretraining_tp"): - # self.base_model.config.pretraining_tp = 1 - - def save_pretrained(self, save_directory, **kwargs): - r""" - This function saves the adapter model and the adapter configuration files to a directory, so that it can be - reloaded using the [`LoraModel.from_pretrained`] class method, and also used by the [`LoraModel.push_to_hub`] - method. - """ - if os.path.isfile(save_directory): - raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file") - os.makedirs(save_directory, exist_ok=True) - - for adapter_name, peft_config in self.peft_config.items(): - # save only the trainable weights - output_state_dict = get_peft_model_state_dict( - self, - state_dict=kwargs.get("state_dict", None), - adapter_name=adapter_name - ) - output_dir = os.path.join(save_directory, adapter_name) if adapter_name != "default" else save_directory - os.makedirs(output_dir, exist_ok=True) - - _exec_save( - ckpt_file_name=os.path.join(output_dir, WEIGHTS_NAME), - data_list=output_state_dict, - ) - - # save the config and change the inference mode to `True` - if peft_config.base_model_name_or_path is None: - peft_config.base_model_name_or_path = ( - self.base_model.__dict__.get("name_or_path", None), - self.base_model.model.__dict__.get("name_or_path", None) - ) - inference_mode = peft_config.inference_mode - peft_config.inference_mode = True - peft_config.save_pretrained(output_dir) - peft_config.inference_mode = inference_mode - - @classmethod - def from_pretrained(cls, model, model_id, adapter_name="default", is_trainable=False, **kwargs): - r""" - Instantiate a [`LoraModel`] from a pretrained Lora configuration and weights. - - Args: - model ([`~transformers.PreTrainedModel`]): - The model to be adapted. 
The model should be initialized with the - [`~transformers.PreTrainedModel.from_pretrained`] method from the 🤗 Transformers library. - model_id (`str` or `os.PathLike`): - The name of the Lora configuration to use. Can be : - - A path to a directory containing a Lora configuration file saved using the `save_pretrained` - method (`./my_lora_config_directory/`). - """ - from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING, PEFT_TYPE_TO_CONFIG_MAPPING - # load peft config - config = PEFT_TYPE_TO_CONFIG_MAPPING[ - PeftConfig.from_pretrained(model_id, subfolder=kwargs.get("subfolder", None)).peft_type - ].from_pretrained(model_id, subfolder=kwargs.get("subfolder", None)) - - config.inference_mode = not is_trainable - - if config.task_type not in MODEL_TYPE_TO_PEFT_MODEL_MAPPING: - model = cls(model, config, adapter_name) - else: - model = MODEL_TYPE_TO_PEFT_MODEL_MAPPING[config.task_type](model, config, adapter_name) - model.load_adapter(model_id, adapter_name, **kwargs) - return model - - def _setup_prompt_encoder(self, adapter_name: str): - config = self.peft_config[adapter_name] - if not hasattr(self, "prompt_encoder"): - self.prompt_encoder = nn.ModuleDict({}) - self.prompt_tokens = {} - transformer_backbone = None - for name, module in self.base_model.named_children(): - for param in module.parameters(): - param.requires_grad = False - if isinstance(module, PreTrainedModel): - # Make sure to freeze Tranformers model - if transformer_backbone is None: - transformer_backbone = module - self.transformer_backbone_name = name - if transformer_backbone is None: - transformer_backbone = self.base_model - - if config.num_transformer_submodules is None: - config.num_transformer_submodules = 2 if config.task_type == TaskType.SEQ_2_SEQ_LM else 1 - - for named_param, value in list(transformer_backbone.named_parameters()): - # for ZeRO-3, the tensor is sharded across accelerators and deepspeed modifies it to a tensor with shape [0] - # the actual unsharded shape is stored in "ds_shape" attribute - # special handling is needed in case the model is initialized in deepspeed.zero.Init() context or HfDeepSpeedConfig - # has been called before - # For reference refer to issue: https://github.com/huggingface/peft/issues/996 - - if value.shape[0] == self.base_model.config.vocab_size: - self.word_embeddings = transformer_backbone.get_submodule(named_param.replace(".weight", "")) - break - - if config.peft_type == PeftType.PROMPT_TUNING: - prompt_encoder = PromptEmbedding(config, self.word_embeddings) - elif config.peft_type == PeftType.MULTITASK_PROMPT_TUNING: - prompt_encoder = MultitaskPromptEmbedding(config, self.word_embeddings) - elif config.peft_type == PeftType.P_TUNING: - prompt_encoder = PromptEncoder(config) - elif config.peft_type == PeftType.PREFIX_TUNING: - prompt_encoder = PrefixEncoder(config) - else: - raise ValueError("Not supported") - - self.prompt_encoder.update(nn.ModuleDict({adapter_name: prompt_encoder})) - self.prompt_tokens[adapter_name] = ops.arange( - config.num_virtual_tokens * config.num_transformer_submodules - ).long() - - def load_adapter(self, model_id: str, adapter_name: str, is_trainable: bool = False, **kwargs): - """load adapter to peft model, called by `model.from_pretrained`.""" - # NOTE: remove download logic. - if adapter_name not in self.peft_config: - raise ValueError(f"{adapter_name} is not a valid adapter name. 
Valid names: {self.peft_config.keys()}") - - adapters_weights = load_peft_weights(model_id) - - # load the weights into the model - load_result = set_peft_model_state_dict(self, adapters_weights, adapter_name=adapter_name) - # TODO: add parallel logic & offload logic & device map logic(dispatch_model) - - # Set model in evaluation mode to deactivate Dropout modules by default - if not is_trainable: - self.set_train(False) - - return load_result - - def get_nb_trainable_parameters(self): - r""" - Returns the number of trainable parameters and the number of all parameters in the model. - """ - trainable_params = 0 - all_param = 0 - for _, param in self.named_parameters(): - num_params = param.numel() - # if using DS Zero 3 and the weights are initialized empty - if num_params == 0 and hasattr(param, "ds_numel"): - num_params = param.ds_numel - - # Due to the design of 4bit linear layers from bitsandbytes - # one needs to multiply the number of parameters by 2 to get - # the correct number of parameters - if param.__class__.__name__ == "Params4bit": - if hasattr(param, "element_size"): - num_bytes = param.element_size() - elif not hasattr(param, "quant_storage"): - num_bytes = 1 - else: - num_bytes = param.quant_storage.itemsize - num_params = num_params * 2 * num_bytes - - all_param += num_params - if param.requires_grad: - trainable_params += num_params - - return trainable_params, all_param - - def get_prompt_embedding_to_save(self, adapter_name: str) -> mindspore.Tensor: - """ - Returns the prompt embedding to save when saving the model. Only applicable when using a prompt learning - method. - """ - prompt_encoder = self.prompt_encoder[adapter_name] - prompt_tokens = ( - self.prompt_tokens[adapter_name].unsqueeze(0).broadcast_to((1, -1)) - ) - if self.peft_config[adapter_name].peft_type == PeftType.PREFIX_TUNING: - prompt_tokens = prompt_tokens[:, : self.peft_config[adapter_name].num_virtual_tokens] - - if self.peft_config[adapter_name].peft_type == PeftType.MULTITASK_PROMPT_TUNING: - prompt_embeddings = super(MultitaskPromptEmbedding, prompt_encoder).forward(prompt_tokens) # pylint: disable=bad-super-call - else: - prompt_embeddings = prompt_encoder(prompt_tokens) - - embedding = prompt_embeddings[0] - return Tensor(embedding.asnumpy()) - - def get_prompt(self, batch_size: int, task_ids: Optional[mindspore.Tensor] = None) -> mindspore.Tensor: - """ - Returns the virtual prompts to use for Peft. Only applicable when using a prompt learning method. 
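The counting in get_nb_trainable_parameters above reduces to two sums over named parameters: every parameter contributes to the total, and only those with requires_grad set contribute to the trainable count (the 4-bit case additionally scales by the storage width). A framework-free sketch with hypothetical layer sizes:

# hypothetical (numel, requires_grad) pairs for a frozen base weight plus LoRA adapters
params = [
    (4096 * 4096, False),   # frozen base weight
    (4096 * 8, True),       # LoRA A
    (8 * 4096, True),       # LoRA B
]
trainable = sum(n for n, needs_grad in params if needs_grad)
total = sum(n for n, _ in params)
print(f"trainable params: {trainable:,d} || all params: {total:,d} || "
      f"trainable%: {100 * trainable / total}")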
- """ - peft_config = self.active_peft_config - prompt_encoder = self.prompt_encoder[self.active_adapter] - prompt_tokens = ( - self.prompt_tokens[self.active_adapter] - .unsqueeze(0) - .broadcast_to((batch_size, -1)) - ) - if peft_config.peft_type == PeftType.PREFIX_TUNING: - prompt_tokens = prompt_tokens[:, : peft_config.num_virtual_tokens] - if peft_config.inference_mode: - past_key_values = prompt_encoder.embedding.weight.tile((batch_size, 1, 1)) - else: - past_key_values = prompt_encoder(prompt_tokens) - if self.base_model_dtype is not None: - past_key_values = past_key_values.to(self.base_model_dtype) - past_key_values = past_key_values.view( - batch_size, - peft_config.num_virtual_tokens, - peft_config.num_layers * 2, - peft_config.num_attention_heads, - peft_config.token_dim // peft_config.num_attention_heads, - ) - if peft_config.num_transformer_submodules == 2: - past_key_values = ops.cat([past_key_values, past_key_values], dim=2) - past_key_values = past_key_values.permute([2, 0, 3, 1, 4]).split( - peft_config.num_transformer_submodules * 2 - ) - if TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING.get(self.config.model_type, None) is not None: - post_process_fn = TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING[self.config.model_type] - past_key_values = post_process_fn(past_key_values) - return past_key_values - else: - if peft_config.peft_type == PeftType.MULTITASK_PROMPT_TUNING: - prompts = prompt_encoder(prompt_tokens, task_ids) - else: - if peft_config.inference_mode: - prompts = prompt_encoder.embedding.weight.tile((batch_size, 1, 1)) - else: - prompts = prompt_encoder(prompt_tokens) - return prompts - - def print_trainable_parameters(self): - """ - Prints the number of trainable parameters in the model. - """ - trainable_params, all_param = self.get_nb_trainable_parameters() - - print( - f"trainable params: {trainable_params:,d} || all params: {all_param:,d} || trainable%: {100 * trainable_params / all_param}" - ) - - def __getattr__(self, name: str): - """Forward missing attributes to the wrapped cell.""" - try: - return super().__getattr__(name) # defer to nn.Module's logic - except AttributeError: - return getattr(self.base_model, name) - - def forward(self, *args, **kwargs): - """ - Forward pass of the model. - """ - # print(self.get_base_model().layers[0].__class__.forward) - return self.get_base_model()(*args, **kwargs) - - def generate(self, *args, **kwargs): - return self.get_base_model().generate(*args, **kwargs) - - @contextmanager - def disable_adapter(self): - """ - Disables the adapter cell. - """ - try: - self.base_model.disable_adapter_layers() - yield - finally: - self.base_model.enable_adapter_layers() - - def get_base_model(self): - """ - Returns the base model. - """ - return ( - self.base_model - if self.active_peft_config.is_prompt_learning - or self.peft_type == PeftType.POLY - else self.base_model.model - ) - - def add_adapter(self, adapter_name: str, peft_config: PeftConfig): - """add adapter.""" - if peft_config.peft_type != self.peft_type: - raise ValueError( - f"Cannot combine adapters with different peft types. " - f"Found {self.peft_type} and {peft_config.peft_type}." 
- ) - - self.peft_config[adapter_name] = peft_config - - try: - if peft_config.is_prompt_learning: # add_adapter methods for prompt learning setup - if hasattr(self.config, "to_dict"): - dict_config = self.config.to_dict() - else: - dict_config = self.config - - peft_config = _prepare_prompt_learning_config(peft_config, dict_config) - self._setup_prompt_encoder(adapter_name) - # elif peft_config.is_adaption_prompt: - # self.base_model.add_adapter(adapter_name, peft_config) - else: - # inject adapter into base model (load model instead of initialize new one) - self.base_model.inject_adapter(self, adapter_name) - except Exception: # somthing went wrong, roll back - del self.peft_config[adapter_name] - raise - - self.set_additional_trainable_modules(peft_config, adapter_name) - - def set_additional_trainable_modules(self, peft_config, adapter_name): - """set additional trainable modules""" - if getattr(peft_config, "modules_to_save", None) is not None: - if self.modules_to_save is None: - self.modules_to_save = set(peft_config.modules_to_save) - else: - self.modules_to_save.update(peft_config.modules_to_save) - _set_trainable(self, adapter_name) - - @property - def active_peft_config(self): - """active_peft_config""" - return self.peft_config[self.active_adapter] - - -class PeftModelForSequenceClassification(PeftModel): - """ - Peft model for sequence classification tasks. - - Args: - model ([`~mindnlp.models.PreTrainedModel`]): Base transformer model. - peft_config ([`PeftConfig`]): Peft config. - - """ - def __init__(self, model, peft_config: PeftConfig, adapter_name="default"): - """ - Initializes a new instance of the PeftModelForSequenceClassification class. - - Args: - self: The instance of the PeftModelForSequenceClassification class. - model: The base model to be used for sequence classification (e.g., a pre-trained language model). - peft_config (PeftConfig): The configuration for the PEFT (Probing and Evaluation for Transformers) model. - adapter_name (str, optional): The name of the adapter to be used. Defaults to 'default'. - - Returns: - None. This method initializes the instance with the specified parameters. - - Raises: - None. - """ - super().__init__(model, peft_config, adapter_name) - if self.modules_to_save is None: - self.modules_to_save = {"classifier", "score"} - else: - self.modules_to_save.update({"classifier", "score"}) - - for name, _ in self.base_model.named_modules(): - if any(module_name in name for module_name in self.modules_to_save): - self.cls_layer_name = name - break - - # to make sure classifier layer is trainable - _set_trainable(self, adapter_name) - - def forward( - self, - input_ids=None, - attention_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - task_ids=None, - **kwargs, - ): - """ - Forward pass of the model. 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - peft_config = self.active_peft_config - if not peft_config.is_prompt_learning: - # NOTE:some args not exists in base model - # inputs_embeds=inputs_embeds, - # output_attentions=output_attentions, - # output_hidden_states=output_hidden_states, - # return_dict=return_dict, - - if peft_config.peft_type == PeftType.POLY: - kwargs["task_ids"] = task_ids - return self.base_model( - input_ids=input_ids, - attention_mask=attention_mask, - labels=labels, - **kwargs, - ) - - batch_size = _get_batch_size(input_ids, inputs_embeds) - if attention_mask is not None: - # concat prompt attention mask - prefix_attention_mask = ops.ones(batch_size, peft_config.num_virtual_tokens, dtype=attention_mask.dtype) - attention_mask = ops.cat((prefix_attention_mask, attention_mask), dim=1) - if kwargs.get("position_ids", None) is not None: - warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") - kwargs["position_ids"] = None - kwargs.update( - { - "attention_mask": attention_mask, - "labels": labels, - "output_attentions": output_attentions, - "output_hidden_states": output_hidden_states, - "return_dict": return_dict, - } - ) - - # if peft_config.peft_type == PeftType.PREFIX_TUNING: - # return self._prefix_tuning_forward(input_ids=input_ids, **kwargs) - if kwargs.get("token_type_ids", None) is not None: - kwargs["token_type_ids"] = ops.cat( - ( - ops.zeros(batch_size, peft_config.num_virtual_tokens, dtype=kwargs["token_type_ids"].dtype), - kwargs["token_type_ids"], - ), - dim=1, - ) - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - prompts = self.get_prompt(batch_size=batch_size) - prompts = prompts.to(inputs_embeds.dtype) - inputs_embeds = ops.cat((prompts, inputs_embeds), dim=1) - return self.base_model(inputs_embeds=inputs_embeds, **kwargs) - - -class PeftModelForCausalLM(PeftModel): - """ - Peft model for causal language modeling. - - Args: - model ([`~mindnlp.models.PreTrainedModel`]): Base transformer model. - peft_config ([`PeftConfig`]): Peft config. - """ - def __init__(self, model, peft_config: PeftConfig, adapter_name="default"): - r""" - Initializes a new instance of the PeftModelForCausalLM class. - - Args: - self: The instance itself. - model: The underlying model for the adapter. - peft_config (PeftConfig): The configuration for the PEFT (Plug and Fine-tune) adapter. - adapter_name (str): The name of the adapter. Defaults to 'default'. - - Returns: - None. This method does not return any value. - - Raises: - N/A - """ - super().__init__(model, peft_config, adapter_name) - self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation - - def forward( - self, - input_ids=None, - attention_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - task_ids=None, - **kwargs, - ): - """ - Forward pass of the model. 
- """ - peft_config = self.active_peft_config - if not isinstance(peft_config, PromptLearningConfig): - if self.base_model.config.model_type == "mpt": - if inputs_embeds is not None: - raise AssertionError("forward in MPTForCausalLM does not support inputs_embeds") - return self.base_model( - input_ids=input_ids, - attention_mask=attention_mask, - labels=labels, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - **kwargs, - ) - if peft_config.peft_type == PeftType.POLY: - kwargs["task_ids"] = task_ids - return self.base_model( - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - labels=labels, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - **kwargs, - ) - - batch_size = input_ids.shape[0] - if attention_mask is not None: - # concat prompt attention mask - prefix_attention_mask = ops.ones(batch_size, peft_config.num_virtual_tokens) - attention_mask = ops.cat((prefix_attention_mask, attention_mask), dim=1) - - if kwargs.get("position_ids", None) is not None: - warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") - kwargs["position_ids"] = None - if kwargs.get("token_type_ids", None) is not None: - warnings.warn("Token type ids are not supported for parameter efficient tuning. Ignoring token type ids") - kwargs["token_type_ids"] = None - kwargs.update( - { - "attention_mask": attention_mask, - "labels": labels, - "output_attentions": output_attentions, - "output_hidden_states": output_hidden_states, - "return_dict": return_dict, - } - ) - - if peft_config.peft_type == PeftType.PREFIX_TUNING: - past_key_values = self.get_prompt(batch_size) - return self.base_model(input_ids=input_ids, past_key_values=past_key_values, **kwargs) - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - # concat prompt labels - if labels is not None: - prefix_labels = ops.full((batch_size, peft_config.num_virtual_tokens), -100) - kwargs["labels"] = ops.cat((prefix_labels, labels), dim=1) - prompts = self.get_prompt(batch_size=batch_size) - prompts = prompts.to(inputs_embeds.dtype) - inputs_embeds = ops.cat((prompts, inputs_embeds), dim=1) - return self.base_model(inputs_embeds=inputs_embeds, **kwargs) - - def generate(self, **kwargs): - """generate.""" - self.base_model.prepare_inputs_for_generation = self.prepare_inputs_for_generation - if hasattr(self.base_model, "model"): - self.base_model.model.generation_config = self.generation_config - else: - self.base_model.generation_config = self.generation_config - try: - outputs = self.base_model.generate(**kwargs) - except: - self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation - raise - - self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation - return outputs - - def prepare_inputs_for_generation(self, *args, task_ids: Optional[mindspore.Tensor] = None, **kwargs,): - """prepare_inputs_for_generation.""" - peft_config = self.active_peft_config - model_kwargs = self.base_model_prepare_inputs_for_generation(*args, **kwargs) - if peft_config.peft_type == PeftType.POLY: - model_kwargs["task_ids"] = task_ids - if isinstance(peft_config, PromptLearningConfig): - if model_kwargs.get("attention_mask", None) is not None: - prefix_attention_mask = ops.ones( - model_kwargs["input_ids"].shape[0], peft_config.num_virtual_tokens) - model_kwargs["attention_mask"] = 
ops.cat( - (prefix_attention_mask, model_kwargs["attention_mask"]), dim=1 - ) - - if model_kwargs.get("position_ids", None) is not None: - warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") - model_kwargs["position_ids"] = None - - if kwargs.get("token_type_ids", None) is not None: - warnings.warn( - "Token type ids are not supported for parameter efficient tuning. Ignoring token type ids" - ) - kwargs["token_type_ids"] = None - - if model_kwargs["past_key_values"] is None and peft_config.peft_type == PeftType.PREFIX_TUNING: - past_key_values = self.get_prompt(batch_size=model_kwargs["input_ids"].shape[0]) - model_kwargs["past_key_values"] = past_key_values - else: - if model_kwargs["past_key_values"] is None: - inputs_embeds = self.word_embeddings(model_kwargs["input_ids"]) - prompts = self.get_prompt(batch_size=model_kwargs["input_ids"].shape[0]) - prompts = prompts.to(inputs_embeds.dtype) - model_kwargs["inputs_embeds"] = ops.cat((prompts, inputs_embeds), dim=1) - model_kwargs["input_ids"] = None - - return model_kwargs - -class PeftModelForSeq2SeqLM(PeftModel): - """ - Peft model for sequence-to-sequence language modeling. - - Args: - model ([`~transformers.PreTrainedModel`]): Base transformer model. - peft_config ([`PeftConfig`]): Peft config. - - """ - def __init__(self, model, peft_config: PeftConfig, adapter_name="default"): - r""" - Initialize a new PeftModelForSeq2SeqLM object. - - Args: - self: The instance of the PeftModelForSeq2SeqLM class. - model: The model to be used for the PeftModelForSeq2SeqLM. - peft_config (PeftConfig): The configuration object for the PeftModelForSeq2SeqLM. - adapter_name (str): The name of the adapter to be used, defaults to 'default'. - - Returns: - None. This method initializes the PeftModelForSeq2SeqLM object. - - Raises: - None. - """ - super().__init__(model, peft_config, adapter_name) - self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation - self.base_model_prepare_encoder_decoder_kwargs_for_generation = ( - self.base_model._prepare_encoder_decoder_kwargs_for_generation - ) - - def forward( - self, - input_ids=None, - attention_mask=None, - inputs_embeds=None, - decoder_input_ids=None, - decoder_attention_mask=None, - decoder_inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - task_ids=None, - **kwargs, - ): - """ - Forward pass of the model. - """ - peft_config = self.active_peft_config - if not isinstance(peft_config, PromptLearningConfig): - return self.base_model( - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - decoder_inputs_embeds=decoder_inputs_embeds, - labels=labels, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - **kwargs, - ) - - batch_size = input_ids.shape[0] - if decoder_attention_mask is not None: - # concat prompt attention mask - prefix_attention_mask = ops.ones(batch_size, peft_config.num_virtual_tokens) - decoder_attention_mask = ops.cat((prefix_attention_mask, decoder_attention_mask), dim=1) - - if kwargs.get("position_ids", None) is not None: - warnings.warn("Position ids are not supported for parameter efficient tuning. 
Ignoring position ids.") - kwargs["position_ids"] = None - if kwargs.get("token_type_ids", None) is not None: - warnings.warn("Token type ids are not supported for parameter efficient tuning. Ignoring token type ids") - kwargs["token_type_ids"] = None - kwargs.update( - { - "attention_mask": attention_mask, - "decoder_attention_mask": decoder_attention_mask, - "labels": labels, - "output_attentions": output_attentions, - "output_hidden_states": output_hidden_states, - "return_dict": return_dict, - } - ) - - if peft_config.peft_type == PeftType.PREFIX_TUNING: - past_key_values = self.get_prompt(batch_size) - return self.base_model( - input_ids=input_ids, decoder_input_ids=decoder_input_ids, past_key_values=past_key_values, **kwargs - ) - elif peft_config.peft_type in [PeftType.PROMPT_TUNING, PeftType.P_TUNING]: - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - - if attention_mask is not None: - # concat prompt attention mask - prefix_attention_mask = ops.ones(batch_size, peft_config.num_virtual_tokens) - kwargs["attention_mask"] = ops.cat((prefix_attention_mask, attention_mask), dim=1) - - prompts = self.get_prompt(batch_size=batch_size) - prompts = prompts.to(inputs_embeds.dtype) - inputs_embeds = ops.cat((prompts[:, : peft_config.num_virtual_tokens], inputs_embeds), dim=1) - - return self.base_model(inputs_embeds=inputs_embeds, **kwargs) - else: - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - if decoder_inputs_embeds is None and decoder_input_ids is None: - decoder_input_ids = shift_tokens_right( - labels, self.config.pad_token_id, self.config.decoder_start_token_id - ) - decoder_inputs_embeds = self.word_embeddings(decoder_input_ids) - - if attention_mask is not None: - # concat prompt attention mask - prefix_attention_mask = ops.ones(batch_size, peft_config.num_virtual_tokens, dtype=attention_mask.dtype) - kwargs["attention_mask"] = ops.cat((prefix_attention_mask, attention_mask), dim=1) - # concat prompt labels - if labels is not None: - if peft_config.num_transformer_submodules == 1: - kwargs["labels"] = labels - elif peft_config.num_transformer_submodules == 2: - prefix_labels = ops.full((batch_size, peft_config.num_virtual_tokens), -100) - kwargs["labels"] = ops.cat((prefix_labels, labels), dim=1) - prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids) - prompts = prompts.to(inputs_embeds.dtype) - inputs_embeds = ops.cat((prompts[:, : peft_config.num_virtual_tokens], inputs_embeds), dim=1) - if peft_config.num_transformer_submodules == 1: - return self.base_model(inputs_embeds=inputs_embeds, **kwargs) - elif peft_config.num_transformer_submodules == 2: - decoder_inputs_embeds = ops.cat( - (prompts[:, peft_config.num_virtual_tokens :], decoder_inputs_embeds), dim=1 - ) - return self.base_model( - inputs_embeds=inputs_embeds, decoder_inputs_embeds=decoder_inputs_embeds, **kwargs - ) - return None # never go here - - def generate(self, **kwargs): - """generate.""" - peft_config = self.active_peft_config - self.base_model.prepare_inputs_for_generation = self.prepare_inputs_for_generation - self.base_model._prepare_encoder_decoder_kwargs_for_generation = ( - self._prepare_encoder_decoder_kwargs_for_generation - ) - try: - if not isinstance(peft_config, PromptLearningConfig): - outputs = self.base_model.generate(**kwargs) - else: - if "input_ids" not in kwargs: - raise ValueError("input_ids must be provided for Peft model generation") - if kwargs.get("position_ids", None) is not None: - warnings.warn( - "Position 
ids are not supported for parameter efficient tuning. Ignoring position ids." - ) - kwargs["position_ids"] = None - if kwargs.get("token_type_ids", None) is not None: - warnings.warn( - "Token type ids are not supported for parameter efficient tuning. Ignoring token type ids" - ) - kwargs["token_type_ids"] = None - - if peft_config.peft_type == PeftType.PREFIX_TUNING: - outputs = self.base_model.generate(**kwargs) - elif peft_config.peft_type in [PeftType.PROMPT_TUNING, PeftType.P_TUNING]: - kwargs = deepcopy(kwargs) - - if "encoder_outputs" in kwargs: - del kwargs["encoder_ouputs"] - warnings.warn( - "`encoder_outputs` should not be passed to `generate` when using prompt tuning. Ignoring it." - ) - - input_ids = kwargs.pop("input_ids") - inputs_embeds = self.word_embeddings(input_ids) - batch_size = inputs_embeds.shape[0] - prompts = self.get_prompt(batch_size=batch_size) - prompts = prompts.to(inputs_embeds.dtype) - - inputs_embeds = ops.cat((prompts[:, : peft_config.num_virtual_tokens], inputs_embeds), dim=1) - kwargs["inputs_embeds"] = inputs_embeds - - if "attention_mask" in kwargs: - prefix_attention_mask = ops.ones(batch_size, peft_config.num_virtual_tokens) - kwargs["attention_mask"] = ops.cat((prefix_attention_mask, kwargs["attention_mask"]), dim=1) - - return self.base_model.generate(**kwargs) - else: - raise NotImplementedError - except: - self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation - self.base_model._prepare_encoder_decoder_kwargs_for_generation = ( - self.base_model_prepare_encoder_decoder_kwargs_for_generation - ) - raise - self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation - self.base_model._prepare_encoder_decoder_kwargs_for_generation = ( - self.base_model_prepare_encoder_decoder_kwargs_for_generation - ) - return outputs - - def prepare_inputs_for_generation(self, *args, task_ids: mindspore.Tensor = None, **kwargs): - """prepare inputs for generation""" - peft_config = self.active_peft_config - model_kwargs = self.base_model_prepare_inputs_for_generation(*args, **kwargs) - if peft_config.peft_type == PeftType.POLY: - model_kwargs["task_ids"] = task_ids - if model_kwargs["past_key_values"] is None and peft_config.peft_type == PeftType.PREFIX_TUNING: - batch_size = model_kwargs["decoder_input_ids"].shape[0] - past_key_values = self.get_prompt(batch_size) - model_kwargs["past_key_values"] = past_key_values - - return model_kwargs - -class PeftModelForTokenClassification(PeftModel): - """ - Peft model for token classification tasks. - - Args: - model ([`~transformers.PreTrainedModel`]): Base transformer model. - peft_config ([`PeftConfig`]): Peft config. - """ - def __init__(self, model, peft_config: PeftConfig = None, adapter_name="default"): - r""" - Initializes a new instance of the PeftModelForTokenClassification class. - - Args: - self: The instance of the PeftModelForTokenClassification class. - model: The model used for token classification. - peft_config (PeftConfig, optional): The configuration for the Peft model. Defaults to None. - adapter_name (str, optional): The name of the adapter. Defaults to 'default'. - - Returns: - None. This method does not return a value. 
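A plain-Python sketch of the hook-swap pattern used by the `generate()` methods above: the wrapped model's `prepare_inputs_for_generation` is temporarily replaced and always restored, even when generation raises. The class names are toy stand-ins, not mindnlp APIs, and the restore is condensed into `try/finally` rather than the explicit except-and-restore of the source.

```python
# Toy illustration of swapping and restoring a bound method around generate().
class ToyBaseModel:
    def prepare_inputs_for_generation(self, **kwargs):
        return {"source": "base", **kwargs}

    def generate(self, **kwargs):
        return self.prepare_inputs_for_generation(**kwargs)

class ToyPeftWrapper:
    def __init__(self, base_model):
        self.base_model = base_model
        self.base_model_prepare_inputs_for_generation = base_model.prepare_inputs_for_generation

    def prepare_inputs_for_generation(self, **kwargs):
        return {"source": "peft", **kwargs}

    def generate(self, **kwargs):
        self.base_model.prepare_inputs_for_generation = self.prepare_inputs_for_generation
        try:
            return self.base_model.generate(**kwargs)
        finally:
            # restore the original hook whether or not generate() succeeded
            self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation

wrapper = ToyPeftWrapper(ToyBaseModel())
print(wrapper.generate(input_ids=[1, 2, 3])["source"])  # -> "peft"
```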
- - Raises: - N/A - """ - super().__init__(model, peft_config, adapter_name) - if self.modules_to_save is None: - self.modules_to_save = {"classifier", "score"} - else: - self.modules_to_save.update({"classifier", "score"}) - - for name, _ in self.base_model.named_modules(): - if any(module_name in name for module_name in self.modules_to_save): - self.cls_layer_name = name - break - - # to make sure classifier layer is trainable - _set_trainable(self, adapter_name) - - def forward( - self, - input_ids=None, - attention_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - task_ids=None, - **kwargs, - ): - """ - Forward pass of the model. - """ - peft_config = self.active_peft_config - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if not isinstance(peft_config, PromptLearningConfig): - if peft_config.peft_type == PeftType.POLY: - kwargs["task_ids"] = task_ids - return self.base_model( - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - labels=labels, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - **kwargs, - ) - - batch_size = input_ids.shape[0] - if attention_mask is not None: - # concat prompt attention mask - prefix_attention_mask = ops.ones(batch_size, peft_config.num_virtual_tokens) - attention_mask = ops.cat((prefix_attention_mask, attention_mask), dim=1) - if kwargs.get("position_ids", None) is not None: - warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") - kwargs["position_ids"] = None - kwargs.update( - { - "attention_mask": attention_mask, - "labels": labels, - "output_attentions": output_attentions, - "output_hidden_states": output_hidden_states, - "return_dict": return_dict, - } - ) - - if peft_config.peft_type == PeftType.PREFIX_TUNING: - return self._prefix_tuning_forward(input_ids=input_ids, **kwargs) - else: - if kwargs.get("token_type_ids", None) is not None: - kwargs["token_type_ids"] = ops.cat( - ( - ops.zeros(batch_size, peft_config.num_virtual_tokens), - kwargs["token_type_ids"], - ), - dim=1, - ).long() - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - prompts = self.get_prompt(batch_size=batch_size) - prompts = prompts.to(inputs_embeds.dtype) - inputs_embeds = ops.cat((prompts, inputs_embeds), dim=1) - return self.base_model(inputs_embeds=inputs_embeds, **kwargs) - - def _prefix_tuning_forward( - self, - input_ids=None, - attention_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs, - ): - r""" - Performs the forward pass for the prefix tuning task in the PeftModelForTokenClassification class. - - Args: - self (PeftModelForTokenClassification): The instance of the PeftModelForTokenClassification class. - input_ids (torch.Tensor): The input token IDs tensor of shape [batch_size, sequence_length]. - attention_mask (torch.Tensor): The attention mask tensor of shape [batch_size, sequence_length]. - inputs_embeds (torch.Tensor): The input embeddings tensor of shape [batch_size, sequence_length, hidden_size]. - labels (torch.Tensor): The labels tensor of shape [batch_size, sequence_length]. - output_attentions (bool): Whether to output attentions. Defaults to None. - output_hidden_states (bool): Whether to output hidden states. Defaults to None. 
- return_dict (bool): Whether to return a dictionary. Defaults to None. - - Returns: - None: This method does not return any value. Instead, it updates the internal state of the model. - - Raises: - ValueError: If the model does not support past key values which are required for prefix tuning. - - """ - batch_size = input_ids.shape[0] - past_key_values = self.get_prompt(batch_size) - fwd_params = list(inspect.signature(self.base_model.forward).parameters.keys()) - kwargs.update( - { - "input_ids": input_ids, - "attention_mask": attention_mask, - "inputs_embeds": inputs_embeds, - "output_attentions": output_attentions, - "output_hidden_states": output_hidden_states, - "return_dict": return_dict, - "past_key_values": past_key_values, - } - ) - if "past_key_values" in fwd_params: - return self.base_model(labels=labels, **kwargs) - else: - transformer_backbone_name = self.base_model.get_submodule(self.transformer_backbone_name) - fwd_params = list(inspect.signature(transformer_backbone_name.forward).parameters.keys()) - if "past_key_values" not in fwd_params: - raise ValueError("Model does not support past key values which are required for prefix tuning.") - outputs = transformer_backbone_name(**kwargs) - sequence_output = outputs[0] - if "dropout" in [name for name, _ in list(self.base_model.named_modules())]: - sequence_output = self.base_model.dropout(sequence_output) - logits = self.base_model.get_submodule(self.cls_layer_name)(sequence_output) - - loss = None - if labels is not None: - loss = F.cross_entropy(logits.view(-1, self.num_labels), labels.view(-1)) - - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - - -class PeftModelForFeatureExtraction(PeftModel): - """ - Peft model for extracting features/embeddings from transformer models - - Args: - model ([`~transformers.PreTrainedModel`]): Base transformer model. - peft_config ([`PeftConfig`]): Peft config. - adapter_name (`str`, *optional*): The name of the adapter, defaults to `"default"`. - autocast_adapter_dtype (`bool`, *optional*): - Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights - using float16 and bfloat16 to float32, as this is typically required for stable training, and only affect - select PEFT tuners. - - **Attributes**: - - **config** ([`~transformers.PretrainedConfig`]) -- The configuration object of the base model. - - Example: - - ```py - >>> from transformers import AutoModel - >>> from peft import PeftModelForFeatureExtraction, get_peft_config - - >>> config = { - ... "peft_type": "LORA", - ... "task_type": "FEATURE_EXTRACTION", - ... "inference_mode": False, - ... "r": 16, - ... "target_modules": ["query", "value"], - ... "lora_alpha": 32, - ... "lora_dropout": 0.05, - ... "fan_in_fan_out": False, - ... "bias": "none", - ... 
} - >>> peft_config = get_peft_config(config) - >>> model = AutoModel.from_pretrained("bert-base-cased") - >>> peft_model = PeftModelForFeatureExtraction(model, peft_config) - >>> peft_model.print_trainable_parameters() - ``` - """ - - def __init__(self, model: nn.Module, peft_config: PeftConfig, adapter_name: str = "default", **kwargs): - super().__init__(model, peft_config, adapter_name, **kwargs) - - def forward( - self, - input_ids=None, - attention_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - task_ids=None, - **kwargs, - ): - peft_config = self.active_peft_config - if not peft_config.is_prompt_learning: - if peft_config.peft_type == PeftType.POLY: - kwargs["task_ids"] = task_ids - - with self._enable_peft_forward_hooks(**kwargs): - kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} - return self.base_model( - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - **kwargs, - ) - - batch_size = _get_batch_size(input_ids, inputs_embeds) - if attention_mask is not None: - # concat prompt attention mask - prefix_attention_mask = ops.ones(batch_size, peft_config.num_virtual_tokens) - attention_mask = ops.cat((prefix_attention_mask, attention_mask), dim=1) - - if kwargs.get("position_ids", None) is not None: - warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") - kwargs["position_ids"] = None - if kwargs.get("token_type_ids", None) is not None: - warnings.warn("Token type ids are not supported for parameter efficient tuning. Ignoring token type ids") - kwargs["token_type_ids"] = None - kwargs.update( - { - "attention_mask": attention_mask, - "output_attentions": output_attentions, - "output_hidden_states": output_hidden_states, - "return_dict": return_dict, - } - ) - - if peft_config.peft_type == PeftType.PREFIX_TUNING: - # overwrite past_kv in kwargs - kwargs["past_key_values"] = self.get_prompt(batch_size) - return self.base_model(input_ids=input_ids, **kwargs) - else: - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - prompts = self.get_prompt(batch_size=batch_size) - prompts = prompts.to(inputs_embeds.dtype) - inputs_embeds = ops.cat((prompts, inputs_embeds), dim=1) - return self.base_model(inputs_embeds=inputs_embeds, **kwargs) diff --git a/mindnlp/peft/tuners/__init__.py b/mindnlp/peft/tuners/__init__.py deleted file mode 100644 index cd6163673..000000000 --- a/mindnlp/peft/tuners/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================ -"""Tuners""" - -from .lora import LoraConfig, LoraModel -from .ia3 import IA3Config, IA3Model -from .adaption_prompt import AdaptionPromptConfig, AdaptionPromptModel -from .adalora import AdaLoraConfig, AdaLoraModel -from .lokr import LoKrConfig, LoKrModel -from .loha import LoHaConfig, LoHaModel -from .p_tuning import PromptEncoder, PromptEncoderConfig, PromptEncoderReparameterizationType -from .prefix_tuning import PrefixTuningConfig, PrefixEncoder -from .prompt_tuning import PromptEmbedding, PromptTuningConfig, PromptTuningInit -from .multitask_prompt_tuning import MultitaskPromptEmbedding, MultitaskPromptTuningConfig, MultitaskPromptTuningInit -from .poly import PolyConfig, PolyModel -from .ln_tuning import LNTuningConfig, LNTuningModel diff --git a/mindnlp/peft/tuners/adalora/__init__.py b/mindnlp/peft/tuners/adalora/__init__.py deleted file mode 100644 index 274d6503f..000000000 --- a/mindnlp/peft/tuners/adalora/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""AdaLora""" - -from .config import AdaLoraConfig -from .layer import AdaLoraLayer, RankAllocator, SVDLinear -from .model import AdaLoraModel - - -__all__ = ["AdaLoraConfig", "AdaLoraLayer", "AdaLoraModel", "SVDLinear", "RankAllocator"] diff --git a/mindnlp/peft/tuners/adalora/config.py b/mindnlp/peft/tuners/adalora/config.py deleted file mode 100644 index a65ba0c6f..000000000 --- a/mindnlp/peft/tuners/adalora/config.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -# pylint: disable=line-too-long -# pylint: disable=invalid-name -# pylint: disable=too-many-instance-attributes -"Adalora Config" -from dataclasses import dataclass, field -from typing import Optional - -from ..lora import LoraConfig -from ...utils import PeftType - - -@dataclass -class AdaLoraConfig(LoraConfig): - """ - This is the configuration class to store the configuration of a [`~peft.AdaLora`]. - - Args: - target_r (`int`): The target average rank of incremental matrix. - init_r (`int`): The initial rank for each incremental matrix. - tinit (`int`): The steps of initial fine-tuning warmup. - tfinal (`int`): The step of final fine-tuning. 
- deltaT (`int`): The time internval between two budget allocations. - beta1 (`float`): The hyperparameter of EMA for sensitivity smoothing. - beta2 (`float`): The hyperparameter of EMA for undertainty quantification. - orth_reg_weight (`float`): The coefficient of orthogonal regularization. - total_step (`int`): The total training steps that should be specified before training. - rank_pattern (`list`): The allocated rank for each weight matrix by RankAllocator. - """ - target_r: int = field(default=8, metadata={"help": "Target Lora matrix dimension."}) - init_r: int = field(default=12, metadata={"help": "Initial Lora matrix dimension."}) - tinit: int = field(default=0, metadata={"help": "The steps of initial warmup."}) - tfinal: int = field(default=0, metadata={"help": "The steps of final warmup."}) - deltaT: int = field(default=1, metadata={"help": "Step interval of rank allocation."}) - beta1: float = field(default=0.85, metadata={"help": "Hyperparameter of EMA."}) - beta2: float = field(default=0.85, metadata={"help": "Hyperparameter of EMA."}) - orth_reg_weight: float = field(default=0.5, metadata={"help": "The orthogonal regularization coefficient."}) - total_step: Optional[int] = field(default=None, metadata={"help": "The total training steps."}) - rank_pattern: Optional[dict] = field(default=None, metadata={"help": "The saved rank pattern."}) - - def __post_init__(self): - r""" - Performs post-initialization actions for the AdaLoraConfig class. - - Args: - self (AdaLoraConfig): The instance of the AdaLoraConfig class. - - Returns: - None: This method does not return any value. - - Raises: - None: This method does not raise any exceptions. - """ - self.peft_type = PeftType.ADALORA diff --git a/mindnlp/peft/tuners/adalora/layer.py b/mindnlp/peft/tuners/adalora/layer.py deleted file mode 100644 index e4045a952..000000000 --- a/mindnlp/peft/tuners/adalora/layer.py +++ /dev/null @@ -1,624 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"Adalora Layer" -import warnings -from typing import Any, List, Optional - -from mindspore import Tensor - -from mindnlp.core import nn, ops, no_grad -from mindnlp.core.nn import Parameter -from mindnlp.core.nn import ParameterDict, ModuleDict -from mindnlp.peft.utils import transpose -from mindnlp.transformers.ms_utils import Conv1D - -from ..tuners_utils import check_adapters_to_merge, BaseTunerLayer - - -class AdaLoraLayer(BaseTunerLayer): - "AdaLoraLayer class for AdaLoraModel." - # List all names of layers that may contain adapter weights - # Note: ranknum doesn't need to be included as it is not an nn.Module - adapter_layer_names = ("lora_A", "lora_B", "lora_E", "lora_embedding_A", "lora_embedding_B") - # other_param_names is defined in LoraLayer - - def __init__(self, base_layer: nn.Module) -> None: - r""" - Initializes an instance of the AdaLoraLayer class. 
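A hedged usage sketch for the config fields documented above. The import path is only valid for the pre-removal tree shown in this diff, the numeric values are illustrative, and the inherited LoRA fields are assumed to behave as in the docstring example further below.

```python
# Sketch: building an AdaLoraConfig from the fields defined in this file.
from mindnlp.peft.tuners.adalora import AdaLoraConfig  # pre-removal layout only

config = AdaLoraConfig(
    init_r=12,          # initial rank of every incremental matrix
    target_r=8,         # average rank the allocator prunes down to
    tinit=200,          # warmup steps before any pruning (illustrative value)
    tfinal=500,         # final fine-tuning steps with a fixed budget
    deltaT=10,          # steps between two budget allocations
    beta1=0.85,         # EMA factor for sensitivity smoothing
    beta2=0.85,         # EMA factor for uncertainty quantification
    total_step=3000,    # must be set before training starts
    target_modules=["q", "v"],
    lora_alpha=32,
    lora_dropout=0.01,
)
```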
- - Args: - self: The instance of the AdaLoraLayer class. - base_layer (nn.Module): The base layer to be used for the AdaLoraLayer. It can be a Dense, Conv2d, Embedding, or Conv1D layer. - For Dense and Conv2d layers, it extracts the input and output channel dimensions. - For Embedding layers, it extracts the vocabulary size and embedding size. - For Conv1D layers, it extracts the weight shape if available, otherwise the weight shape. - Any other layer type will raise a ValueError. - The base_layer is used to initialize the in_features and out_features attributes of the AdaLoraLayer. - - Returns: - None - - Raises: - ValueError: If the base_layer is not one of the supported layer types. - """ - self.base_layer = base_layer - self.r = {} - self.lora_alpha = {} - self.scaling = {} - self.lora_dropout = ModuleDict() - self.lora_E = ParameterDict({}) - self.lora_A = ParameterDict({}) - self.lora_B = ParameterDict({}) - self.ranknum = ParameterDict({}) - # For Embedding layer - self.lora_embedding_A = ModuleDict() - self.lora_embedding_B = ModuleDict() - if isinstance(base_layer, nn.Linear): - in_features, out_features = base_layer.in_features, base_layer.out_features - elif isinstance(base_layer, nn.Conv2d): - in_features, out_features = base_layer.in_channels, base_layer.out_channels - elif isinstance(base_layer, nn.Embedding): - in_features, out_features = base_layer.num_embeddings, base_layer.embedding_dim - elif isinstance(base_layer, Conv1D): - in_features, out_features = ( - base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape - ) - else: - raise ValueError(f"Unsupported layer type {type(base_layer)}") - self.in_features = in_features - self.out_features = out_features - - def update_layer(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights): - r""" - This method updates the AdaLoraLayer with the provided parameters. - - Args: - self (object): The instance of the AdaLoraLayer class. - adapter_name (str): The name of the adapter to be updated. - r (int): The rank of the adapter. Should be a positive integer or 0. - lora_alpha (float): The alpha value for Lora scaling. - lora_dropout (float): The dropout probability for Lora. Should be greater than 0.0. - init_lora_weights (bool): If True, initializes Lora weights. - - Returns: - None: This method does not return any value. - - Raises: - ValueError: If the value of 'r' is less than 0, a ValueError is raised. 
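A numpy sketch of the factor shapes `update_layer()` allocates for a rank-`r` adapter on an `(out_features, in_features)` base weight; numpy arrays stand in for mindspore parameters and the shapes are illustrative.

```python
# lora_A holds right singular vectors, lora_E the singular values,
# lora_B the left singular vectors; their product matches the base weight.
import numpy as np

in_features, out_features, r = 16, 10, 4

lora_A = np.random.randn(r, in_features)   # (r, in)
lora_E = np.random.randn(r, 1)             # (r, 1), one value per rank
lora_B = np.random.randn(out_features, r)  # (out, r)

delta_w = lora_B @ (lora_A * lora_E)       # E broadcasts over the columns of A
print(delta_w.shape)                       # (10, 16), same as the base weight
```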
- """ - if r < 0: - # note: r == 0 is allowed for AdaLora, see #1539 - raise ValueError(f"`r` should be a positive integer or 0, but the value passed is {r}") - - self.r[adapter_name] = r - self.lora_alpha[adapter_name] = lora_alpha - if lora_dropout > 0.0: - lora_dropout_layer = nn.Dropout(p=lora_dropout) - else: - lora_dropout_layer = nn.Identity() - - self.lora_dropout[adapter_name] = lora_dropout_layer - # Actual trainable parameters - # Right singular vectors - if r > 0: - weight_A = ops.randn((r, self.in_features)) - weight_E = ops.randn((r, 1)) - weight_B = ops.randn((self.out_features, r)) - else: - rank_idx = Tensor([False]) - weight_A = ops.randn((1, self.in_features)) - weight_E = ops.randn((1, 1)) - weight_B = ops.randn((self.out_features, 1)) - weight_A = weight_A[rank_idx, :] - weight_E = weight_E[rank_idx, :] - weight_B = weight_B[:, rank_idx] - self.lora_A.update({adapter_name: Parameter(weight_A)}) - # Singular values - self.lora_E.update({adapter_name: Parameter(weight_E)}) - # Left singular vectors - self.lora_B.update({adapter_name: Parameter(weight_B)}) - # The current rank - self.ranknum.update({adapter_name: Parameter(Tensor(float(r)), requires_grad=False)}) - self.scaling[adapter_name] = lora_alpha if lora_alpha > 0 else float(r) - if init_lora_weights and r > 0: - self.reset_lora_parameters(adapter_name) - self.set_adapter(self.active_adapters) - - def reset_lora_parameters(self, adapter_name): - if adapter_name in self.lora_A.keys(): - nn.init.zeros_(self.lora_E[adapter_name]) - nn.init.normal_(self.lora_A[adapter_name], mean=0.0, std=0.02) - nn.init.normal_(self.lora_B[adapter_name], mean=0.0, std=0.02) - - -class SVDLinear(nn.Module, AdaLoraLayer): - "SVD-based adaptation by a dense layer" - # SVD-based adaptation by a dense layer - def __init__( - self, - base_layer: nn.Module, - adapter_name: str, - r: int = 0, - lora_alpha: int = 1, - lora_dropout: float = 0.0, - fan_in_fan_out: bool = False, - init_lora_weights: bool = True, - **kwargs, - ) -> None: - r""" - Initializes an instance of the SVDLinear class. - - Args: - self: The object itself. - base_layer (nn.Module): The base layer of the SVDLinear model. - adapter_name (str): The name of the adapter. - r (int, optional): The number of singular values to keep. Defaults to 0. - lora_alpha (int, optional): The alpha value for the LORA algorithm. Defaults to 1. - lora_dropout (float, optional): The dropout rate for the LORA algorithm. Defaults to 0.0. - fan_in_fan_out (bool, optional): Indicates whether to use fan-in/fan-out scaling. Defaults to False. - init_lora_weights (bool, optional): Indicates whether to initialize the LORA weights. Defaults to True. - **kwargs: Additional keyword arguments. - - Returns: - None. This method does not return any value. - - Raises: - None. This method does not raise any exceptions. - """ - super().__init__() - AdaLoraLayer.__init__(self, base_layer) - # Freezing the pre-trained weight matrix - self.get_base_layer().requires_grad = False - - self.fan_in_fan_out = fan_in_fan_out - self._active_adapter = adapter_name - self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) - - def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: - """ - Merge the active adapter weights into the base weights - - Args: - safe_merge (`bool`, *optional*): - If True, the merge operation will be performed in a copy of the original weights and check for NaNs - before merging the weights. 
This is useful if you want to check if the merge operation will produce - NaNs. Defaults to `False`. - adapter_names (`List[str]`, *optional*): - The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults - to `None`. - """ - adapter_names = check_adapters_to_merge(self, adapter_names) - if not adapter_names: - # no adapter to merge - return - - for active_adapter in adapter_names: - base_layer = self.get_base_layer() - if active_adapter in self.lora_A.keys(): - if safe_merge: - # Note that safe_merge will be slower than the normal merge - # because of the copy operation. - orig_weights = base_layer.weight.data.clone() - orig_weights += self.get_delta_weight(active_adapter) - - if not ops.isfinite(orig_weights).all(): - raise ValueError( - f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" - ) - - base_layer.weight.data = orig_weights - else: - base_layer.weight.data += self.get_delta_weight(active_adapter) - self.merged_adapters.append(active_adapter) - - def unmerge(self) -> None: - """ - This method unmerges all merged adapter layers from the base weights. - """ - if not self.merged: - warnings.warn("Already unmerged. Nothing to do.") - return - while len(self.merged_adapters) > 0: - active_adapter = self.merged_adapters.pop() - if active_adapter in self.lora_A.keys(): - self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) - - def get_delta_weight(self, adapter) -> Tensor: - r""" - Calculates the delta weight for a given adapter in the SVDLinear class. - - Args: - self (SVDLinear): An instance of the SVDLinear class. - adapter: The adapter index for which the delta weight needs to be calculated. - - Returns: - Tensor: A tensor representing the delta weight for the specified adapter. - - Raises: - None. - - This method calculates the delta weight for a specific adapter in the SVDLinear class. The delta weight is computed using the following formula: - - delta_weight = transpose(self.lora_B[adapter] @ (self.lora_A[adapter] * self.lora_E[adapter]), self.fan_in_fan_out) * self.scaling[adapter] / (self.ranknum[adapter] + 1e-05) - - The method returns the calculated delta weight as a Tensor object. - """ - return ( - transpose(self.lora_B[adapter] @ (self.lora_A[adapter] * self.lora_E[adapter]), self.fan_in_fan_out) - * self.scaling[adapter] - / (self.ranknum[adapter] + 1e-5) - ) - - def forward(self, x: Tensor, *args: Any, **kwargs: Any) -> Tensor: - r"""Constructs a tensor using the SVDLinear method. - - Args: - self: An instance of the SVDLinear class. - x (Tensor): The input tensor for the forward method. - - Returns: - Tensor: The forwarded tensor. - - Raises: - None. - """ - if self.disable_adapters: - if self.merged: - self.unmerge() - result = self.base_layer(x, *args, **kwargs) - elif self.merged: - result = self.base_layer(x, *args, **kwargs) - else: - result = self.base_layer(x, *args, **kwargs) - for active_adapter in self.active_adapters: - if active_adapter not in self.lora_A.keys(): - continue - lora_A = self.lora_A[active_adapter] - lora_B = self.lora_B[active_adapter] - lora_E = self.lora_E[active_adapter] - dropout = self.lora_dropout[active_adapter] - scaling = self.scaling[active_adapter] - ranknum = self.ranknum[active_adapter] + 1e-5 - - x = x.to(lora_A.dtype) - result += (dropout(x) @ (lora_A * lora_E).T @ lora_B.T) * scaling / ranknum - - return result - - def __repr__(self) -> str: - r""" - This method returns a string representation of the object. 
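A numpy sketch of `get_delta_weight()` and the low-rank term added in `forward()`, ignoring the `fan_in_fan_out` transpose and using numpy in place of mindspore ops; it also checks that merging the delta into the base weight is equivalent to adding the term at forward time.

```python
import numpy as np

in_features, out_features, r = 16, 10, 4
scaling, ranknum = 32.0, float(r) + 1e-5

lora_A = np.random.randn(r, in_features)
lora_E = np.random.randn(r, 1)
lora_B = np.random.randn(out_features, r)

# delta_W = B @ (A * E) * scaling / (ranknum + eps)
delta_w = lora_B @ (lora_A * lora_E) * scaling / ranknum          # (out, in)

x = np.random.randn(2, in_features)                               # toy batch
base_out = np.zeros((2, out_features))                            # stand-in for base_layer(x)
result = base_out + x @ (lora_A * lora_E).T @ lora_B.T * scaling / ranknum

# Merged and unmerged paths agree (up to floating-point rounding).
assert np.allclose(result, base_out + x @ delta_w.T)
```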
- - Args: - self: SVDLinear instance. Represents the current instance of the SVDLinear class. - - Returns: - str: A string representation of the object, prefixed with 'adalora.'. - - Raises: - No specific exceptions are raised within this method. - """ - rep = super().__repr__() - return "adalora." + rep - - -class RankAllocator: - """ - The RankAllocator for AdaLoraModel. Paper: https://openreview.net/pdf?id=lq62uWRJjiY - - Args: - config ([`AdaLoraConfig`]): The configuration of the AdaLora model. - model: the model that we apply AdaLoRA to. - - """ - def __init__(self, model, peft_config, adapter_name): - """ - Initializes a RankAllocator instance. - - Args: - self: The RankAllocator instance. - model: The model to be used for rank allocation. - peft_config: The PEFT configuration object containing beta1 and beta2 values. - adapter_name: The name of the adapter. - - Returns: - None. This method does not return any value. - - Raises: - AssertionError: If the beta1 or beta2 values in peft_config are not within the range (0, 1). - """ - self.peft_config = peft_config - self.adapter_name = adapter_name - self.beta1 = peft_config.beta1 - self.beta2 = peft_config.beta2 - assert self.beta1 > 0 and self.beta1 < 1 - assert self.beta2 > 0 and self.beta2 < 1 - - self.reset_ipt() - self._set_budget_scheduler(model) - - def set_total_step(self, total_step): - r""" - Sets the total number of steps in the RankAllocator. - - Args: - self (RankAllocator): The RankAllocator object. - total_step (int): The total number of steps in the RankAllocator. It specifies the maximum number of steps that can be allocated. - - Returns: - None. This method does not return any value. - - Raises: - None. - - """ - self.peft_config.total_step = total_step - - def reset_ipt(self): - r""" - Resets the 'ipt' attribute, along with its associated attributes 'exp_avg_ipt' and 'exp_avg_unc', in the RankAllocator class. - - Args: - self: An instance of the RankAllocator class. - - Returns: - None. This method does not return any value. - - Raises: - This method does not raise any exceptions. - """ - self.ipt = {} - self.exp_avg_ipt = {} - self.exp_avg_unc = {} - - def _set_budget_scheduler(self, model): - r""" - This method '_set_budget_scheduler' belongs to the class 'RankAllocator' and is responsible for setting up the budget scheduler based on the provided model. - - Args: - self: Instance of the RankAllocator class. It is used to access and modify the attributes and methods of the class. - model: An object representing the model. The method iterates through the parameters and names of the model to calculate the initial budget 'init_bgt' and create a set of names 'name_set'. - - Returns: - None: This method does not return any value. - - Raises: - No specific exceptions are documented to be raised by this method. However, potential exceptions could arise from working with the input parameters or during the iteration process. - """ - self.init_bgt = 0 - self.name_set = set() - for n, p in model.parameters_and_names(): - if f"lora_A.{self.adapter_name}" in n: - self.init_bgt += p.data.shape[0] - self.name_set.add(n.replace("lora_A", "%s")) - self.name_set = sorted(self.name_set) - # The total final rank budget - self.target_bgt = self.peft_config.target_r * len(self.name_set) - - def budget_schedule(self, step: int): - r""" - This method calculates the budget and mask indicator based on the given step value. - - Args: - self (RankAllocator): The instance of the RankAllocator class. 
- step (int): The current step for which the budget and mask indicator need to be calculated. It should be a non-negative integer. - - Returns: - tuple: A tuple containing the calculated budget and mask indicator. The budget is an integer representing the budget value, and the mask indicator is a boolean indicating whether the mask should be -applied. - - Raises: - No specific exceptions are raised by this method. - """ - tinit = self.peft_config.tinit - tfinal = self.peft_config.tfinal - total_step = self.peft_config.total_step - # Initial warmup - if step <= tinit: - budget = self.init_bgt - mask_ind = False - # Final fine-tuning - elif step > total_step - tfinal: - budget = self.target_bgt - mask_ind = True - else: - # Budget decreasing with a cubic scheduler - mul_coeff = 1 - (step - tinit) / (total_step - tfinal - tinit) - budget = int((self.init_bgt - self.target_bgt) * (mul_coeff**3) + self.target_bgt) - mask_ind = step % self.peft_config.deltaT == 0 - return budget, mask_ind - - def update_ipt(self, model): - # Update the sensitivity and uncertainty for every weight - for n, p in model.named_parameters(): - if "lora_" in n and self.adapter_name in n: - if n not in self.ipt: - self.ipt[n] = ops.zeros_like(p) - self.exp_avg_ipt[n] = ops.zeros_like(p) - self.exp_avg_unc[n] = ops.zeros_like(p) - with no_grad(): - self.ipt[n] = (p * p.grad).abs() - # Sensitivity smoothing - self.exp_avg_ipt[n] = self.beta1 * self.exp_avg_ipt[n] + (1 - self.beta1) * self.ipt[n] - # Uncertainty quantification - self.exp_avg_unc[n] = ( - self.beta2 * self.exp_avg_unc[n] + (1 - self.beta2) * (self.ipt[n] - self.exp_avg_ipt[n]).abs() - ) - - def _element_score(self, n): - r""" - This method calculates the element score based on the exponential average input and exponential average uncertainty values. - - Args: - self (RankAllocator): The instance of the RankAllocator class. - n (int): The index of the element for which the score needs to be calculated. It should be a non-negative integer. - - Returns: - None: This method does not return any value explicitly but calculates the element score based on the input parameters. - - Raises: - - IndexError: If the index 'n' is out of bounds or negative. - - TypeError: If the input values are not of the expected types. - """ - return self.exp_avg_ipt[n] * self.exp_avg_unc[n] - - def _combine_ipt(self, ipt_E, ipt_AB): - r""" - This method combines two input arrays, ipt_E and ipt_AB, into a single array and returns the resulting sum. - - Args: - self (object): The instance of the RankAllocator class. - ipt_E (array-like): An array containing elements to be combined. It should be a 1-dimensional array. - ipt_AB (array-like): An array containing elements to be combined. It should be a 2-dimensional array. - - Returns: - array-like: A 1-dimensional array containing the sum of the elements from ipt_E and ipt_AB. - - Raises: - ValueError: If ipt_AB is not a 2-dimensional array. - TypeError: If ipt_E is not a valid array-like object. - TypeError: If ipt_AB is not a valid array-like object. - ValueError: If the shapes of ipt_E and ipt_AB are not compatible for addition. - - """ - ipt_AB = ipt_AB.sum(axis=1, keepdims=False) - sum_ipt = ipt_E.view(-1) + ipt_AB.view(-1) - return sum_ipt - - def mask_to_budget(self, model, budget): - r""" - The 'mask_to_budget' method in the class 'RankAllocator' calculates a mask threshold based on the given budget and applies the threshold to mask certain parameters in the model. - - Args: - self: The instance of the RankAllocator class. 
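A plain-Python sketch of `budget_schedule()` as implemented above: full budget during warmup, a cubic decay toward the target budget, then a fixed final budget, with masking only every `deltaT` steps during the decay phase. The numbers fed in are illustrative only.

```python
def budget_schedule(step, init_bgt, target_bgt, tinit, tfinal, total_step, deltaT):
    if step <= tinit:                        # initial warmup
        return init_bgt, False
    if step > total_step - tfinal:           # final fine-tuning
        return target_bgt, True
    mul_coeff = 1 - (step - tinit) / (total_step - tfinal - tinit)
    budget = int((init_bgt - target_bgt) * (mul_coeff ** 3) + target_bgt)
    return budget, step % deltaT == 0

# e.g. 36 initial ranks pruned toward 24 over 3000 steps
for step in (100, 1000, 2000, 2900):
    print(step, budget_schedule(step, 36, 24, 200, 500, 3000, 10))
```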
- model: A model object representing the neural network model to be processed. It is expected to have a method 'parameters_and_names()' and 'masked_fill()' to access and modify model parameters. - budget: An integer representing the budget for masking parameters. It restricts the number of parameters that can be masked based on their importance scores. - - Returns: - None: This method does not return any value. It modifies the model parameters in place based on the calculated mask threshold and budget. - - Raises: - - KeyError: If the adapter name specified in the method is not found in the model parameters. - - ValueError: If the budget provided is not a positive integer. - - TypeError: If the input model is not the expected type or format. - - RuntimeError: If there are any runtime errors during the execution of the method. - """ - value_ipt = {} - vector_ipt = {} - triplet_ipt = {} - # Get the importance score for A, E, B - for n, p in model.parameters_and_names(): - if f"lora_A.{self.adapter_name}" in n: - entry_ipt = self._element_score(n) - comb_ipt = ops.mean(entry_ipt, dim=1, keepdim=True) - name_m = n.replace("lora_A", "%s") - if name_m not in vector_ipt: - vector_ipt[name_m] = [comb_ipt] - else: - vector_ipt[name_m].append(comb_ipt) - if f"lora_B.{self.adapter_name}" in n: - entry_ipt = self._element_score(n) - comb_ipt = ops.mean(entry_ipt, dim=0, keepdim=False).view(-1, 1) - name_m = n.replace("lora_B", "%s") - if name_m not in vector_ipt: - vector_ipt[name_m] = [comb_ipt] - else: - vector_ipt[name_m].append(comb_ipt) - if f"lora_E.{self.adapter_name}" in n: - entry_ipt = self._element_score(n) - name_m = n.replace("lora_E", "%s") - value_ipt[name_m] = entry_ipt - - all_score = [] - # Calculate the score for each triplet - for name_m in vector_ipt: - ipt_E = value_ipt[name_m] - ipt_AB = ops.cat(vector_ipt[name_m], dim=1) - sum_ipt = self._combine_ipt(ipt_E, ipt_AB) - name_E = name_m % "lora_E" - triplet_ipt[name_E] = sum_ipt.view(-1, 1) - all_score.append(sum_ipt.view(-1)) - - # Get the threshold by ranking ipt - mask_threshold = ops.topk( - ops.cat(all_score), - k=self.init_bgt - budget, - largest=False - )[0][self.init_bgt - budget-1].item() - - rank_pattern = {} - # Mask the unimportant triplets - for n, p in model.parameters_and_names(): - if f"lora_E.{self.adapter_name}" in n: - p.masked_fill(triplet_ipt[n] <= mask_threshold, 0.0) - rank_pattern[n] = (~(triplet_ipt[n] <= mask_threshold)).view(-1).asnumpy().tolist() - return rank_pattern - - def update_and_allocate(self, model, global_step, force_mask=False): - # # Update the importance score and allocate the budget - if global_step < self.peft_config.total_step - self.peft_config.tfinal: - self.update_ipt(model) - budget, mask_ind = self.budget_schedule(global_step) - # Allocate the budget according to importance scores - if mask_ind or force_mask: - rank_pattern = self.mask_to_budget(model, budget) - else: - rank_pattern = None - return budget, rank_pattern - - def mask_using_rank_pattern(self, model, rank_pattern): - r""" - Applies a mask to the model parameters based on the provided rank pattern. - - Args: - self (RankAllocator): The instance of the RankAllocator class. - model: The model containing the parameters to be masked. - rank_pattern: A dictionary containing the rank pattern used for masking the parameters. - The keys of the dictionary represent parameter names, and the corresponding - values are the mask patterns. - - Returns: - None. The method modifies the model parameters in-place. - - Raises: - None. 
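A numpy sketch of the pruning step in `mask_to_budget()`: the threshold is the largest of the `init_bgt - budget` smallest importance scores, and the rank pattern keeps everything strictly above it. Scores here are made-up numbers.

```python
import numpy as np

scores = np.array([0.9, 0.1, 0.4, 0.7, 0.2, 0.5])  # per-triplet importance
init_bgt, budget = scores.size, 4                  # keep 4 of 6 triplets
k = init_bgt - budget

mask_threshold = np.sort(scores)[k - 1]            # k-th smallest value
rank_pattern = scores > mask_threshold             # True for kept triplets
print(mask_threshold, rank_pattern.tolist())
# 0.2 [True, False, True, True, False, True]
```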
- """ - def mask_using_rank_pattern(self, model, rank_pattern): - """ - Applies a mask to the model parameters based on the provided rank pattern. - - Args: - self (RankAllocator): The instance of the RankAllocator class. - model: The model containing the parameters to be masked. - rank_pattern: A dictionary containing the rank pattern used for masking the parameters. - The keys of the dictionary represent parameter names, and the corresponding - values are the mask patterns. - - Returns: - None. The method modifies the model parameters in-place. - - Raises: - None. - """ - # Mask the unimportant triplets - is_adapter_name_truncated = False - if self.adapter_name not in next(iter(rank_pattern.keys())): - is_adapter_name_truncated = True - - for n, p in model.parameters_and_names(): - if f"lora_E.{self.adapter_name}" in n: - key = n if not is_adapter_name_truncated else n.replace(f".{self.adapter_name}", "") - mask = Tensor(rank_pattern[key]).unsqueeze(-1) - p.masked_fill(~mask.bool(), 0.0) diff --git a/mindnlp/peft/tuners/adalora/model.py b/mindnlp/peft/tuners/adalora/model.py deleted file mode 100644 index 2cebc4c95..000000000 --- a/mindnlp/peft/tuners/adalora/model.py +++ /dev/null @@ -1,487 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"Adalora Model" -import warnings -from mindspore import Tensor -from mindnlp.core.nn import Parameter - -from mindnlp.core import nn, ops -from mindnlp.transformers.ms_utils import Conv1D -from mindnlp.peft.tuners.lora import LoraConfig, LoraModel -from mindnlp.peft.utils import ( - TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING, - _freeze_adapter, - _get_submodules, -) - -from ..tuners_utils import BaseTunerLayer -from .layer import AdaLoraLayer, RankAllocator, SVDLinear - - -class AdaLoraModel(LoraModel): - """ - Creates AdaLoRA (Adaptive LoRA) model from a pretrained transformers model. Paper: - https://openreview.net/forum?id=lq62uWRJjiY - - Args: - model ([`mindspore.nn.Module`]): The model to be adapted. - config ([`AdaLoraConfig`]): The configuration of the AdaLora model. - adapter_name (`str`): The name of the adapter, defaults to `"default"`. - - Returns: - AdaLoraModel ([`mindspore.nn.Module`]): The AdaLora model. - - Example:: - - >>> from transformers import AutoModelForSeq2SeqLM, LoraConfig >>> from peft import AdaLoraModel, AdaLoraConfig - >>> config = AdaLoraConfig( - peft_type="ADALORA", task_type="SEQ_2_SEQ_LM", r=8, lora_alpha=32, target_modules=["q", "v"], - lora_dropout=0.01, - ) - >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") >>> model = AdaLoraModel(model, config, "default") - - > **Attributes**: - - > - **model** ([`transformers.PreTrainedModel`])— The model to be adapted. - - > - **peft_config** ([`AdaLoraConfig`]): The configuration of the AdaLora model. 
- """ - # Note: don't redefine prefix here, it should be inherited from LoraModel - - def __init__(self, model, config, adapter_name): - r""" - Initializes an instance of the AdaLoraModel class. - - Args: - self (AdaLoraModel): The current instance of the AdaLoraModel. - model: The underlying model to be used. - config: The configuration object for the AdaLoraModel. - adapter_name: The name of the adapter to be used. - - Returns: - None. - - Raises: - ValueError: If more than one trainable adapter is specified. - TypeError: If the adapter specified by 'adapter_name' is not in the configuration. - AttributeError: If the specified adapter is in inference mode. - """ - super().__init__(model, config, adapter_name) - - traininable_mode_counter = 0 - for peft_config in self.peft_config.values(): - if not peft_config.inference_mode: - traininable_mode_counter += 1 - - if traininable_mode_counter > 1: - raise ValueError( - "AdaLoraModel supports only 1 trainable adapter. " - "When using multiple adapters, set inference_mode to True for all adapters except the one you want to train." - ) - - if self.peft_config[adapter_name].inference_mode: - _freeze_adapter(self.model, adapter_name) - else: - self.trainable_adapter_name = adapter_name - self.rankallocator = RankAllocator(self.model, self.peft_config[adapter_name], self.trainable_adapter_name) - - def _check_new_adapter_config(self, config: LoraConfig) -> None: - """ - A helper method to check the config when a new adapter is being added. - - Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters. - - """ - super()._check_new_adapter_config(config) - - traininable_mode_counter = 0 - for config_ in self.peft_config.values(): - if not config_.inference_mode: - traininable_mode_counter += 1 - - if traininable_mode_counter > 1: - raise ValueError( - f"{self.__class__.__name__} supports only 1 trainable adapter. " - "When using multiple adapters, set inference_mode to True for all adapters except the one " - "you want to train." - ) - def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: - """ - Marks only specific adapters in the model as trainable based on the specified bias configuration. - - Args: - self: The instance of the AdaLoraModel class. - model (nn.Module): The neural network model for which adapters should be marked as trainable. - - Returns: - None. This method does not return any value. - - Raises: - NotImplementedError: If the requested bias configuration is not implemented. - """ - for n, p in model.parameters_and_names(): - if "lora_" not in n: - p.requires_grad = False - - for active_adapter in self.active_adapters: - bias = self.peft_config[active_adapter].bias - if bias == "none": - continue - - if bias == "all": - for n, p in model.parameters_and_names(): - if "bias" in n: - p.requires_grad = True - elif bias == "lora_only": - for m in model.modules(): - if isinstance(m, AdaLoraLayer) and hasattr(m, "bias") and m.bias is not None: - m.bias.requires_grad = True - else: - raise NotImplementedError(f"Requested bias: {bias}, is not implemented.") - def _create_and_replace( - self, - lora_config, - adapter_name, - target, - target_name, - parent, - current_key, - **optionnal_kwargs, - ): - r""" - This method '_create_and_replace' is defined within the 'AdaLoraModel' class and is responsible for creating and replacing a cell based on the provided parameters. - - Args: - self (object): The instance of the 'AdaLoraModel' class. 
- lora_config (object): An object containing LoRa configuration parameters. - adapter_name (str): The name of the adapter. - target (object): The target object on which the cell will be created and replaced. - target_name (str): The name of the target. - parent (object): The parent object where the cell will be replaced. - current_key: Additional optional keyword arguments. - - Returns: - None: This method does not return any value. - - Raises: - TypeError: If the 'target' parameter is not an instance of the 'AdaLoraLayer' class. - Exception: Any other unexpected exceptions may be raised during the execution of this method. - """ - kwargs = { - "r": lora_config.init_r, - "lora_alpha": lora_config.lora_alpha, - "lora_dropout": lora_config.lora_dropout, - "fan_in_fan_out": lora_config.fan_in_fan_out, - "init_lora_weights": lora_config.init_lora_weights, - } - kwargs["loaded_in_8bit"] = optionnal_kwargs.pop("loaded_in_8bit", False) - kwargs["loaded_in_4bit"] = optionnal_kwargs.pop("loaded_in_4bit", False) - # if (kwargs["loaded_in_8bit"] or kwargs["loaded_in_4bit"]) and not is_bnb_available(): - # raise ImportError( - # "To use AdaLora with 8-bit quantization, please install the `bitsandbytes` package. " - # "You can install it with `pip install bitsandbytes`." - # ) - # quantization_config = get_quantization_config(self.model, method="gptq") - # if quantization_config is not None: - # kwargs["gptq_quantization_config"] = quantization_config - - # If it is not an AdaLoraLayer, create a new cell, else update it with new adapters - if not isinstance(target, AdaLoraLayer): - new_cell = self._create_new_cell(lora_config, adapter_name, target, **kwargs) - self._replace_cell(parent, target_name, new_cell, target) - else: - target.update_layer( - adapter_name, - lora_config.init_r, - lora_config.lora_alpha, - lora_config.lora_dropout, - lora_config.init_lora_weights, - ) - - @staticmethod - def _create_new_cell(lora_config, adapter_name, target, **kwargs): - r""" - This method creates a new cell for the AdaLoraModel. - - Args: - lora_config (LoraConfig): The configuration for the LoRa model. - adapter_name (str): The name of the adapter. - target (Union[BaseTunerLayer, nn.Module]): The target layer for which the new cell is being created. - - Returns: - None. This method returns None. - - Raises: - - ValueError: If the target cell is not supported. Currently, only `torch.nn.Linear` and `Conv1D` are supported. - - Warning: If the 'fan_in_fan_out' parameter needs to be adjusted based on the type of the target cell. 
- """ - # avoid eager bnb import - # if is_bnb_available(): - # import bitsandbytes as bnb - - # from .bnb import SVDLinear8bitLt - # if is_bnb_4bit_available(): - # from .bnb import SVDLinear4bit - - # gptq_quantization_config = kwargs.get("gptq_quantization_config", None) - # AutoGPTQQuantLinear = get_auto_gptq_quant_linear(gptq_quantization_config) - - # loaded_in_8bit = kwargs.pop("loaded_in_8bit", False) - # loaded_in_4bit = kwargs.pop("loaded_in_4bit", False) - - if isinstance(target, BaseTunerLayer): - target_base_layer = target.get_base_layer() - else: - target_base_layer = target - - # if loaded_in_8bit and isinstance(target_base_layer, bnb.nn.Linear8bitLt): - # kwargs.update( - # { - # "has_fp16_weights": target_base_layer.state.has_fp16_weights, - # "memory_efficient_backward": target_base_layer.state.memory_efficient_backward, - # "threshold": target_base_layer.state.threshold, - # "index": target_base_layer.index, - # } - # ) - # new_cell = SVDLinear8bitLt(target, adapter_name, **kwargs) - # elif loaded_in_4bit and is_bnb_4bit_available() and isinstance(target_base_layer, bnb.nn.Linear4bit): - # fourbit_kwargs = kwargs.copy() - # fourbit_kwargs.update( - # { - # "compute_dtype": target_base_layer.compute_dtype, - # "compress_statistics": target_base_layer.weight.compress_statistics, - # "quant_type": target_base_layer.weight.quant_type, - # } - # ) - # new_cell = SVDLinear4bit(target, adapter_name, **fourbit_kwargs) - # elif AutoGPTQQuantLinear is not None and isinstance(target, AutoGPTQQuantLinear): - # new_cell = SVDQuantLinear(target, adapter_name, **kwargs) - if isinstance(target_base_layer, nn.Linear): - if kwargs["fan_in_fan_out"]: - warnings.warn( - "fan_in_fan_out is set to True but the target cell is `torch.nn.Linear`. " - "Setting fan_in_fan_out to False." - ) - kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False - elif isinstance(target_base_layer, Conv1D): - if not kwargs["fan_in_fan_out"]: - warnings.warn( - "fan_in_fan_out is set to False but the target cell is `Conv1D`. " - "Setting fan_in_fan_out to True." - ) - kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = True - else: - raise ValueError( - f"Target cell {target} is not supported. " - f"Currently, only `torch.nn.Linear` and `Conv1D` are supported." - ) - new_cell = SVDLinear(target, adapter_name, **kwargs) - - return new_cell - def _replace_cell(self, parent, child_name, new_cell, child): - r""" - This method '_replace_cell' is defined within the 'AdaLoraModel' class. - It replaces a cell within the model with a new cell, transferring relevant attributes from the original cell to the new cell. - - Args: - - self (object): The instance of the AdaLoraModel class. - - parent (object): The parent object where the cell is to be replaced. - - child_name (str): The name of the child attribute within the parent object. - - new_cell (object): The new cell object that will replace the original cell. - - child (object): The original cell object that is being replaced. - - Returns: - None. This method does not return any value. - - Raises: - This method does not explicitly raise any exceptions. However, it may raise AttributeError if the attributes being accessed do not exist in the provided objects. 
- """ - setattr(parent, child_name, new_cell) - - # child layer wraps the original cell, unpack it - if hasattr(child, "base_layer"): - child = child.base_layer - - # layers with base_layer don't need the weight to be copied, as they have a reference already - if not hasattr(new_cell, "base_layer"): - new_cell.weight = child.weight - if hasattr(child, "bias"): - new_cell.bias = child.bias - - if getattr(child, "state", None) is not None: - if hasattr(new_cell, "base_layer"): - new_cell.base_layer.state = child.state - else: - new_cell.state = child.state - @staticmethod - def _prepare_adapter_config(peft_config, model_config): - r""" - This method '_prepare_adapter_config' in the class 'AdaLoraModel' prepares the adapter configuration based on the provided 'peft_config' and 'model_config' parameters. - - Args: - - peft_config (dict): A dictionary containing the configuration details for the adapter. It should include information about the target modules. If 'target_modules' is not specified, it is inferred based -on the 'model_type' from the 'model_config' parameter. - - model_config (dict): A dictionary containing the configuration details specific to the model. It is used to determine the 'model_type' which is then used to infer the 'target_modules' if not explicitly -provided in 'peft_config'. - - Returns: - None: This method does not return any value but updates the 'peft_config' parameter with the inferred or provided 'target_modules' based on the 'model_type'. - - Raises: - - ValueError: Raised if 'target_modules' is not specified in 'peft_config' and the 'model_type' from 'model_config' does not have a corresponding mapping in -TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING. - """ - if peft_config.target_modules is None: - if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING: - raise ValueError("Please specify `target_modules` in `peft_config`") - peft_config.target_modules = TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING[ - model_config["model_type"] - ] - return peft_config - - def __getattr__(self, name: str): - """Forward missing attributes to the wrapped cell.""" - try: - return super().__getattr__(name) # defer to nn.Module's logic - except AttributeError: - return getattr(self.model, name) - - def forward(self, *args, **kwargs): - """The forward method of the model""" - outputs = self.model(*args, **kwargs) - - if (getattr(outputs, "loss", None) is not None) and isinstance(outputs.loss, Tensor): - # Calculate the orthogonal regularization - orth_reg_weight = self.peft_config[self.trainable_adapter_name].orth_reg_weight - - if orth_reg_weight <= 0: - raise ValueError("orth_reg_weight should be greater than 0. 
") - - regu_loss = 0 - num_param = 0 - for n, p in self.model.parameters_and_names(): - if ("lora_A" in n or "lora_B" in n) and self.trainable_adapter_name in n: - para_cov = p @ p.T if "lora_A" in n else p.T @ p - I = ops.eye(*para_cov.shape) # noqa: E741 - I = ops.stop_gradient(I) - num_param += 1 - regu_loss += ops.norm(para_cov - I, p="fro") - if num_param > 0: - regu_loss = regu_loss / num_param - else: - regu_loss = 0 - outputs.loss += orth_reg_weight * regu_loss - return outputs - - def resize_modules_by_rank_pattern(self, rank_pattern, adapter_name): - "resize the modules by rank pattern" - lora_config = self.peft_config[adapter_name] - for name, rank_idx in rank_pattern.items(): - if isinstance(rank_idx, list): - rank = sum(rank_idx) - rank_idx = Tensor(rank_idx).view(-1) - elif isinstance(rank_idx, Tensor): - rank_idx = rank_idx.view(-1) - rank = rank_idx.sum().item() - else: - raise ValueError("Unexpected type of rank_idx") - key = ".".join(name.split(".")[0:-2]) if adapter_name in name else ".".join(name.split(".")[0:-1]) - _, target, _ = _get_submodules(self.model, key) - lora_E_weights = target.lora_E[adapter_name][rank_idx] - lora_A_weights = target.lora_A[adapter_name][rank_idx] - lora_B_weights = target.lora_B[adapter_name][:, rank_idx] - ranknum = target.ranknum[adapter_name] - target.update_layer( - adapter_name, - rank, - lora_config.lora_alpha, - lora_config.lora_dropout, - lora_config.init_lora_weights, - ) - if rank > 0: - target.lora_E.update({adapter_name: Parameter(lora_E_weights)}) - target.lora_A.update({adapter_name: Parameter(lora_A_weights)}) - target.lora_B.update({adapter_name: Parameter(lora_B_weights)}) - # The scaling is exactly as the previous - target.ranknum.update({adapter_name: Parameter(ranknum)}) - - def resize_state_dict_by_rank_pattern(self, rank_pattern, state_dict, adapter_name): - "resize the state_dict by rank pattern" - for name, rank_idx in rank_pattern.items(): - rank = sum(rank_idx) - prefix = ".".join(name.split(".")[0:-2]) if adapter_name in name else ".".join(name.split(".")[0:-1]) - for layer in ["lora_E", "lora_A", "lora_B"]: - key = f"base_model.model.{prefix}.{layer}.{adapter_name}" - if layer != "lora_B": - if rank != state_dict[key][2].reshape(state_dict[key][0]).shape[0]: - dims = [] - data = state_dict[key][2].reshape(state_dict[key][0]) - data = data[rank_idx] - state_dict[key][2] = data.reshape(-1) - for dim in data.shape: - dims.append(dim) - state_dict[key][0] = dims - else: - if rank != state_dict[key][2].reshape(state_dict[key][0]).shape[1]: - dims = [] - data = state_dict[key][2].reshape(state_dict[key][0]) - data = data[:, rank_idx] - state_dict[key][2] = data.reshape(-1) - for dim in data.shape: - dims.append(dim) - state_dict[key][0] = dims - return state_dict - - def update_and_allocate(self, global_step): - """ - This method updates Adalora budget and mask. - - This should be called in every training step after `loss.backward()` and before `zero_grad()`. - - `tinit`, `tfinal` and `deltaT` are handled with in the method. - - Args: - global_step (`int`): The current training step, it is used to calculate adalora budget. 
- - Example: - - ```python - >>> loss = model(**input).loss - >>> loss.backward() - >>> optimizer.step() - >>> model.base_model.update_and_allocate(i_step) - >>> optimizer.zero_grad() - ``` - """ - lora_config = self.peft_config[self.trainable_adapter_name] - # Update the importance score and allocate the budget - if global_step < lora_config.total_step - lora_config.tfinal: - _, rank_pattern = self.rankallocator.update_and_allocate(self.model, global_step) - if rank_pattern: - lora_config.rank_pattern = rank_pattern - # Finalize the budget allocation - elif global_step == lora_config.total_step - lora_config.tfinal: - _, rank_pattern = self.rankallocator.update_and_allocate(self.model, global_step, force_mask=True) - # for some reason, this freezes the trainable parameters and nothing gets updates - # self.resize_modules_by_rank_pattern(rank_pattern, self.trainable_adapter_name) - lora_config.rank_pattern = rank_pattern - self.rankallocator.reset_ipt() - # Currently using inefficient way to mask the unimportant weights using the rank pattern - # due to problem mentioned above - elif global_step > lora_config.total_step - lora_config.tfinal: - self.rankallocator.mask_using_rank_pattern(self.model, lora_config.rank_pattern) - # Pass the function and do forward propagation - else: - return None diff --git a/mindnlp/peft/tuners/adaption_prompt/__init__.py b/mindnlp/peft/tuners/adaption_prompt/__init__.py deleted file mode 100644 index 9455bf891..000000000 --- a/mindnlp/peft/tuners/adaption_prompt/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""adaption_prompt""" - -from .config import AdaptionPromptConfig -from .layer import AdaptedAttention -from .model import AdaptionPromptModel - - -__all__ = ["AdaptionPromptConfig", "AdaptedAttention", "AdaptionPromptModel"] diff --git a/mindnlp/peft/tuners/adaption_prompt/config.py b/mindnlp/peft/tuners/adaption_prompt/config.py deleted file mode 100644 index bb624c031..000000000 --- a/mindnlp/peft/tuners/adaption_prompt/config.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
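The orthogonal regularization added in `AdaLoraModel.forward` above penalizes each `lora_A` / `lora_B` matrix for drifting away from orthonormal rows/columns. The sketch below restates that term in plain NumPy so it can be checked in isolation; parameter names and shapes mirror the deleted code, the weight value is illustrative, and the MindSpore `ops` calls are replaced by NumPy equivalents.

```python
import numpy as np

def adalora_orth_reg(named_params, adapter_name="default", orth_reg_weight=0.5):
    """Average Frobenius distance of P @ P.T (lora_A) or P.T @ P (lora_B)
    from the identity, scaled by orth_reg_weight."""
    regu_loss, num_param = 0.0, 0
    for name, p in named_params:
        if ("lora_A" in name or "lora_B" in name) and adapter_name in name:
            cov = p @ p.T if "lora_A" in name else p.T @ p
            regu_loss += np.linalg.norm(cov - np.eye(cov.shape[0]), ord="fro")
            num_param += 1
    return orth_reg_weight * regu_loss / max(num_param, 1)

# Random stand-ins with AdaLoRA-style shapes: lora_A is (r, d), lora_B is (d, r).
params = [
    ("decoder.block.0.lora_A.default", np.random.randn(8, 128)),
    ("decoder.block.0.lora_B.default", np.random.randn(128, 8)),
]
extra_loss = adalora_orth_reg(params)  # added onto outputs.loss in forward()
```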
-# ============================================================================ -# pylint: disable=C0301 -"""Module for defining and configuring the Adaption Prompt in PEFT framework.""" -from collections import namedtuple -from dataclasses import dataclass, field -from mindnlp.peft.config import PeftConfig -from mindnlp.peft.utils import PeftType -from .utils import llama_compute_query_states - -@dataclass -class AdaptionPromptConfig(PeftConfig): - """Stores the configuration of an [`AdaptionPromptModel`].""" - target_modules: str = field( - default=None, metadata={"help": "Name of the attention submodules to insert adaption prompts into."} - ) - adapter_len: int = field(default=None, metadata={"help": "Number of adapter tokens to insert"}) - adapter_layers: int = field(default=None, metadata={"help": "Number of adapter layers (from the top)"}) - - def __post_init__(self): - r""" - This method is called automatically after the initialization of an instance of the 'AdaptionPromptConfig' class. - - Args: - self: An instance of the 'AdaptionPromptConfig' class. - - Returns: - None. This method does not return any value. - - Raises: - None. - - Description: - This method sets the 'peft_type' attribute of the 'AdaptionPromptConfig' instance to 'PeftType.ADAPTION_PROMPT'. - The 'peft_type' attribute represents the type of the adaption prompt configuration. - - Example: - config = AdaptionPromptConfig() - config.__post_init__() - print(config.peft_type) # Output: PeftType.ADAPTION_PROMPT - """ - self.peft_type = PeftType.ADAPTION_PROMPT - - @property - def is_adaption_prompt(self) -> bool: - """Return True if this is an adaption prompt config.""" - return True - - -# Contains the config that is specific to a transformers model type. -ModelTypeConfig = namedtuple( - "ModelTypeConfig", ["compute_query_states", "target_modules", "k_proj_layer", "v_proj_layer", "o_proj_layer"] -) - -# Mapping of transformers model types to their specific configuration. -TRANSFORMERS_MODEL_CONFIG = { - "llama": ModelTypeConfig( - compute_query_states=llama_compute_query_states, - target_modules="self_attn", - k_proj_layer="k_proj", - v_proj_layer="v_proj", - o_proj_layer="o_proj", - ), - "mistral": ModelTypeConfig( # same as llama, - compute_query_states=llama_compute_query_states, - target_modules="self_attn", - k_proj_layer="k_proj", - v_proj_layer="v_proj", - o_proj_layer="o_proj", - ), -} - - -def prepare_config( - peft_config: AdaptionPromptConfig, - model, -) -> AdaptionPromptConfig: - """Prepare the config based on the llama model type.""" - if model.config.model_type not in TRANSFORMERS_MODEL_CONFIG: - raise ValueError("Unsupported model type for adaption prompt: '{model.config.model_type}'.") - - model_config = TRANSFORMERS_MODEL_CONFIG[model.config.model_type] - - if peft_config.target_modules is None: - peft_config.target_modules = model_config.target_modules - - return peft_config diff --git a/mindnlp/peft/tuners/adaption_prompt/layer.py b/mindnlp/peft/tuners/adaption_prompt/layer.py deleted file mode 100644 index 9cb878a3f..000000000 --- a/mindnlp/peft/tuners/adaption_prompt/layer.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
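For context on the API being removed here: an adaption-prompt config only needs the prompt length and the number of top attention layers to adapt, while `prepare_config` later fills `target_modules` from `TRANSFORMERS_MODEL_CONFIG` based on `model.config.model_type`. A hedged usage sketch against the deleted module (the numeric values are illustrative, not defaults):

```python
from mindnlp.peft.tuners.adaption_prompt import AdaptionPromptConfig

# Illustrative values: a 10-token adaption prompt in the top 30 attention layers.
config = AdaptionPromptConfig(adapter_len=10, adapter_layers=30)

# target_modules is left as None; prepare_config(config, model) resolves it
# (e.g. to "self_attn" for llama/mistral models) before adapters are created.
assert config.is_adaption_prompt
```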
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""layer for adaption prompt tuners.""" -import math -import numpy as np -import mindspore -from mindspore import Tensor -from mindnlp.core.nn import Parameter -from mindnlp.core import nn, ops -from .config import TRANSFORMERS_MODEL_CONFIG - - -class AdaptedAttention(nn.Module): - """This cell wraps a LLamaAttention cell and injects adaption prompts.""" - def __init__(self, model_type: str, adapter_len: int, model): - """ - Initialize object. - - Args: - model_type: The transformer model type. This is used to retrieve the right method to - compute query states. - adapter_len: The length of the adaption prompt to insert. - model: The original transformer attention cell that is being wrapped. - """ - super(AdaptedAttention, self).__init__() - self.model_type = model_type - self.model = model - self.adapter_len = adapter_len - - # 正确的初始化和使用 Normal 初始化器 - normal_values = np.random.normal(loc=0.0, scale=1.0, size=(adapter_len, self.model.hidden_size)).astype( - np.float32) - self.adaption_prompt = Parameter(Tensor(normal_values, dtype=mindspore.float32)) - - # 使用零初始化器初始化门控参数 - zero_values = np.zeros((1,), dtype=np.float32) - self.adaption_gate = Parameter(Tensor(zero_values, dtype=mindspore.float32)) - - def forward(self, **kwargs): - """ - Forward pass for the adapter which wraps the original LlamaAttention cell. - Args: - kwargs: See the original LlamaAttention cell. 
- """ - if kwargs.get("output_attention", False): - raise NotImplementedError("output_attention is not currently supported.") - - output, _, past_key_value = self.model(**kwargs) - bsz = output.shape[0] - q_len = output.shape[1] - embed_dim = output.shape[2] - k_proj_layer = TRANSFORMERS_MODEL_CONFIG[self.model_type].k_proj_layer - v_proj_layer = TRANSFORMERS_MODEL_CONFIG[self.model_type].v_proj_layer - o_proj_layer = TRANSFORMERS_MODEL_CONFIG[self.model_type].o_proj_layer - factor = ( - self.model.k_proj.in_features // self.model.k_proj.out_features - ) - - if k_proj_layer == v_proj_layer: - _, key, value = getattr(self.model, k_proj_layer)(self.adaption_prompt).split(embed_dim, axis=2) - else: - key = getattr(self.model, k_proj_layer)(self.adaption_prompt) - value = getattr(self.model, v_proj_layer)(self.adaption_prompt) - - # Operations are similar to PyTorch but using MindSpore operations - adapter_k = key.view(1, self.adapter_len, (self.model.num_heads // factor), self.model.head_dim) - adapter_k = ops.tile(adapter_k, (bsz, 1, 1, 1)) - adapter_k = ops.permute(adapter_k, (0, 2, 1, 3)) - - adapter_v = value.view(1, self.adapter_len, (self.model.num_heads // factor), self.model.head_dim) - adapter_v = ops.tile(adapter_v, (bsz, 1, 1, 1)) - adapter_v = ops.permute(adapter_v, (0, 2, 1, 3)) - - # Repeat interleave functionality - adapter_k = ops.repeat_interleave(adapter_k, (1, factor, 1, 1)) - adapter_v = ops.repeat_interleave(adapter_v, (1, factor, 1, 1)) - - # Recompute query states - compute_query_states = TRANSFORMERS_MODEL_CONFIG[self.model_type].compute_query_states - query_states = compute_query_states(model=self.model, **kwargs) - - previous_dtype = query_states.dtype - - # Dot product and softmax operations - scores = ops.bmm(query_states, adapter_k.transpose(2, 3)) - scores /= math.sqrt(self.model.head_dim) - - softmax = nn.Softmax(dim=-1) - scores = softmax(scores).astype(mindspore.float32) # upcasting to fp32 - scores *= self.adaption_gate - - adapter_output = ops.matmul(scores, adapter_v).swapaxes(1, 2).reshape(bsz, q_len, -1) - - # Projection layer if exists - if o_proj_layer is not None: - adapter_output = getattr(self.model, o_proj_layer)(adapter_output) - - # Combine outputs - output = output + adapter_output - output = output.astype(previous_dtype) # restore dtype if necessary - - return output, None, past_key_value diff --git a/mindnlp/peft/tuners/adaption_prompt/model.py b/mindnlp/peft/tuners/adaption_prompt/model.py deleted file mode 100644 index e80e82f1c..000000000 --- a/mindnlp/peft/tuners/adaption_prompt/model.py +++ /dev/null @@ -1,181 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
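The heart of `AdaptedAttention.forward` above is a second, gated attention of the recomputed query states over the `adapter_len` learned prompt tokens; because `adaption_gate` is zero-initialized, the wrapped layer initially reproduces the original attention output exactly. A minimal NumPy sketch of that extra term for a single head (projections, batching and dtype handling are omitted; names follow the deleted code):

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def adaption_prompt_term(query_states, adapter_k, adapter_v, adaption_gate):
    """query_states: (q_len, head_dim); adapter_k / adapter_v: (adapter_len, head_dim)."""
    head_dim = query_states.shape[-1]
    scores = query_states @ adapter_k.T / np.sqrt(head_dim)  # (q_len, adapter_len)
    scores = adaption_gate * softmax(scores)                 # gate starts at 0.0
    return scores @ adapter_v                                # added to the base output

q = np.random.randn(5, 64)    # q_len=5, head_dim=64
k = np.random.randn(10, 64)   # adapter_len=10
v = np.random.randn(10, 64)
assert np.allclose(adaption_prompt_term(q, k, v, adaption_gate=0.0), 0.0)
```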
-"""model for adaption prompt tuners.""" -from typing import Dict, List - -from mindnlp.core import nn - -from mindnlp.peft.utils import _freeze_adapter, _get_submodules - -from .config import AdaptionPromptConfig, prepare_config -from .layer import AdaptedAttention -from .utils import is_adaption_prompt_trainable - - -class AdaptionPromptModel(nn.Module): - """ - Implements adaption prompts as described in https://arxiv.org/pdf/2303.16199.pdf. - - The top L attention modules are replaced with AdaptedAttention modules that wrap the original ones, but insert - trainable prompts with gates (for zero init). - - Notes on the multi-adapter pattern: - - We store the states of different adapters by keeping a dictionary of AdaptedAttention modules indexed by adapter - name. - - Every time we switch adapters, we remove the modules of the currently active adapter from the model, store them - in the dictionary, and replace them with the modules of the new adapter. - - To avoid duplicated and potentially inconsistent state, the currently active adapter is always removed from the - dictionary. - - Disabling the adapter would also result in the modules being removed from the model. - """ - def __init__(self, model, configs: Dict, adapter_name: str): - r""" - Initializes an instance of the AdaptionPromptModel class. - - Args: - self: The current instance of the class. - model: The underlying model to be used for adaption prompts. Expected to be an object of a specific model class. - configs: A dictionary containing configuration details for the adaption prompt model. - - Type: Dict - - Purpose: Specifies various configurations required for the adaption prompt model. - - Restrictions: None - adapter_name: The name of the adapter to be added. - - Type: str - - Purpose: Identifies the adapter which needs to be added to the adaption prompt model. 
- - Restrictions: None - - Returns: - None - - Raises: - None - """ - super(AdaptionPromptModel, self).__init__() - self.model = model - self.peft_config = {} - self._parents = {} - self._cached_adapters = {} - self._active_adapter = None - self._enabled = True - self.forward = self.model.forward - self.add_adapter(adapter_name, configs[adapter_name]) - self._mark_only_adaption_prompts_as_trainable(self.model) - - def add_adapter(self, adapter_name: str, config: AdaptionPromptConfig) -> None: - """Add an adapter with the given name and config.""" - config = prepare_config(config, self.model) - if adapter_name in self.peft_config: - raise ValueError(f"Adapter named '{adapter_name}' already exists.") - - parents = [] - # 获取模型的所有子模块及其名称 - for name, submodule in self.model.named_modules().items(): - if name.endswith(config.target_modules): - # 对每个符合条件的子模块调用 _get_submodules 函数 - parent, target, target_name = _get_submodules(self.model, name) - if target == submodule: - parents.append(parent) - - if len(parents) < config.adapter_layers: - raise ValueError("Config specifies more adapter layers than available in the model.") - - parents = parents[-config.adapter_layers:] - self._parents[adapter_name] = parents - - if self._active_adapter and self._enabled: - self._remove_adapted_attentions(self._active_adapter) - self._active_adapter = adapter_name - self.peft_config[adapter_name] = config - self._create_adapted_attentions(config, parents) - if not self._enabled: - self._remove_adapted_attentions(adapter_name) - - if config.inference_mode: - _freeze_adapter(self.model, adapter_name) - - def set_adapter(self, adapter_name: str) -> None: - """Set the model to use the adapter with the given name.""" - if self._active_adapter == adapter_name: - return - if adapter_name not in self.peft_config: - raise ValueError(f"Adapter with name '{adapter_name}' does not exist.") - - if self._enabled: - self._remove_adapted_attentions(self._active_adapter) - self._set_adapted_attentions(adapter_name) - - self._active_adapter = adapter_name - - def enable_adapter_layers(self): - """Enable adapter layers by swapping in cached AdaptedAttention modules.""" - self._enabled = True - self._set_adapted_attentions(self._active_adapter) - - def disable_adapter_layers(self): - """Disable adapter layers by swapping out AdaptedAttention modules.""" - self._enabled = False - self._remove_adapted_attentions(self._active_adapter) - - def _create_adapted_attentions(self, config: AdaptionPromptConfig, parents: List[nn.Module]) -> None: - """Wrap LlamaAttention modules with newly created AdaptedAttention modules.""" - for par in parents: - attn = AdaptedAttention( - model_type=self.model.config.model_type, - adapter_len=config.adapter_len, - model=getattr(par, config.target_modules), - ) - setattr(par, config.target_modules, attn) - - def _set_adapted_attentions(self, adapter_name: str) -> None: - """Replace LlamaAttention modules with cached AdaptedAttention modules.""" - cached = self._cached_adapters[adapter_name] - del self._cached_adapters[adapter_name] - config = self.peft_config[adapter_name] - for i, par in enumerate(self._parents[adapter_name]): - setattr(par, config.target_modules, cached[i]) - - def _remove_adapted_attentions(self, adapter_name: str) -> None: - """Remove AdaptedAttention modules from the model and store them in the cache.""" - config = self.peft_config[adapter_name] - adapted_attentions = [] - for par in self._parents[adapter_name]: - attn = getattr(par, config.target_modules) - 
adapted_attentions.append(attn) - setattr(par, config.target_modules, attn.model) - self._cached_adapters[adapter_name] = adapted_attentions - - def _mark_only_adaption_prompts_as_trainable(self, model: nn.Module) -> None: - r"""Marks only adaption prompts as trainable in the given model. - - Args: - self (AdaptionPromptModel): The instance of AdaptionPromptModel class. - model (nn.Module): The model for which adaption prompts need to be marked as trainable. - - Returns: - None. This method does not return any value. - - Raises: - None. - """ - for param in model.trainable_params(): - if not is_adaption_prompt_trainable(param.name): - param.requires_grad = False - def __getattr__(self, name: str): - """Forward missing attributes to the wrapped cell.""" - try: - return super().__getattr__(name) # defer to nn.Module's logic - except AttributeError: - # This is necessary as e.g. causal models have various methods that we - # don't want to re-implement here. - return getattr(self.model, name) diff --git a/mindnlp/peft/tuners/adaption_prompt/utils.py b/mindnlp/peft/tuners/adaption_prompt/utils.py deleted file mode 100644 index cdcb71645..000000000 --- a/mindnlp/peft/tuners/adaption_prompt/utils.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Utility functions for adaption prompt tuners.""" -import inspect -from mindspore import Tensor -from mindnlp.core import nn, ops - - -def llama_rotate_half(x: Tensor) -> Tensor: - """ - Rotate half the hidden dims of the input. - - This function was duplicated verbatim from: - https://github.com/huggingface/transformers/blob/1de8ce9ee1191ba761a593ac15d9ccbf5851bfc5/src/transformers/models/llama/modeling_llama.py#L126 - - This was done to eliminate the Llama transformers implementation as a dependency of this file. Note that some other - functions were also adapted from the transformers implementation but were modified. - """ - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return ops.cat((-x2, x1), -1) - - -def llama_apply_rotary_pos_emb(q, cos, sin, position_ids): - """ - Apply rotary position embedding to query states in the Llama model using MindSpore. - """ - if cos.ndim == 4: - gather_indices = ops.broadcast_to(position_ids[:, None, :, None], (1, cos.shape[1], 1, cos.shape[3])) - cos = ops.gather_elements(cos, 2, gather_indices) - sin = ops.gather_elements(sin, 2, gather_indices) - else: - cos = cos[position_ids].unsqueeze(1) - sin = sin[position_ids].unsqueeze(1) - q_embed = (q * cos) + (llama_rotate_half(q) * sin) - return q_embed - - -def llama_compute_query_states(model: nn.Module, **kwargs) -> Tensor: - """ - Computes query states for a neural network model. - - Args: - model (nn.Module): The neural network model for which query states are computed. - - Returns: - Tensor: The computed query states. - - Raises: - ValueError: If the input parameters do not meet the required constraints. 
- """ - hidden_states = kwargs.get("hidden_states") - position_ids = kwargs.get("position_ids") - past_key_value = kwargs.get("past_key_value") - bsz, q_len, _ = hidden_states.shape - query_states = ops.permute(model.q_proj(hidden_states).reshape(bsz, q_len, model.num_heads, model.head_dim), (0, 2, 1, 3)) - - factor = model.k_proj.in_features // model.k_proj.out_features - value_states = ops.permute(model.v_proj(hidden_states).reshape(bsz, q_len, (model.num_heads // factor), model.head_dim), (0, 2, 1, 3)) - - seq_len = q_len - if past_key_value is not None: - if isinstance(past_key_value, tuple): - seq_len += past_key_value[0].shape[-2] - else: - seq_len += past_key_value.get_seq_length(model.layer_idx) - - if "position_ids" not in inspect.signature(model.rotary_emb).parameters: - cos, sin = model.rotary_emb(value_states, seq_len=seq_len) - return llama_apply_rotary_pos_emb(query_states, cos, sin, position_ids) - - past_seen_tokens = 0 - if position_ids is None: - if past_key_value is None: - new_cache_positions = Tensor(ops.arange(q_len, q_len + q_len)) - else: - past_seen_tokens = past_key_value.get_usable_length(q_len, model.layer_idx) - new_cache_positions = Tensor(ops.arange(past_seen_tokens, past_seen_tokens + q_len)) - position_ids = ops.unsqueeze(new_cache_positions, 0) - - rotary_emb_kwargs = {"position_ids": position_ids} - if "seq_len" in inspect.signature(model.rotary_emb).parameters: - rotary_emb_kwargs["seq_len"] = seq_len - - cos, sin = model.rotary_emb(value_states, **rotary_emb_kwargs) - if cos.shape[0] == 3: - cos = ops.unsqueeze(cos, 1) - sin = ops.unsqueeze(sin, 1) - - return (query_states * cos) + (llama_rotate_half(query_states) * sin) - - -def is_adaption_prompt_trainable(params: str) -> bool: - """Return True if cell is trainable under adaption prompt fine-tuning.""" - return params.split(".")[-1].startswith("adaption_") diff --git a/mindnlp/peft/tuners/ia3/__init__.py b/mindnlp/peft/tuners/ia3/__init__.py deleted file mode 100644 index 99029cdb3..000000000 --- a/mindnlp/peft/tuners/ia3/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""IA3""" - -from .config import IA3Config -from .layer import Conv2d, IA3Layer, Linear -from .model import IA3Model - - -__all__ = ["Conv2d", "IA3Config", "IA3Layer", "IA3Model", "Linear"] diff --git a/mindnlp/peft/tuners/ia3/config.py b/mindnlp/peft/tuners/ia3/config.py deleted file mode 100644 index adaf19b1e..000000000 --- a/mindnlp/peft/tuners/ia3/config.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
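Stripped of batching, caching and `position_ids` gathering, the rotary helpers above (`llama_rotate_half`, `llama_apply_rotary_pos_emb`) implement a plain rotation of paired query dimensions, which in particular preserves the query norm. A simplified NumPy restatement for a single position and head (the base 10000 and the `head_dim` value are illustrative assumptions):

```python
import numpy as np

def rotate_half(x):
    x1, x2 = np.split(x, 2, axis=-1)
    return np.concatenate((-x2, x1), axis=-1)

def apply_rotary(q, cos, sin):
    return q * cos + rotate_half(q) * sin

head_dim, pos = 8, 3
inv_freq = 1.0 / (10000 ** (np.arange(0, head_dim, 2) / head_dim))
angles = np.concatenate([pos * inv_freq, pos * inv_freq])  # duplicated halves, Llama-style
q = np.random.randn(head_dim)
q_rot = apply_rotary(q, np.cos(angles), np.sin(angles))
assert np.isclose(np.linalg.norm(q_rot), np.linalg.norm(q))  # rotation preserves the norm
```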
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -# pylint: disable=C0301 -"IA3 Config" -from dataclasses import dataclass, field -from typing import List, Optional, Union - -from mindnlp.peft.config import PeftConfig -from mindnlp.peft.utils import PeftType - - -@dataclass -class IA3Config(PeftConfig): - """ - This is the configuration class to store the configuration of a [`IA3Model`]. - - Args: - target_modules (`Optional[Union[List[str], str]]`): - The names of the modules to apply the adapter to. If this is specified, only the modules with the specified - names will be replaced. When passing a string, a regex match will be performed. When passing a list of - strings, either an exact match will be performed or it is checked if the name of the cell ends with any - of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D modules are chosen, - excluding the output layer. If this is not specified, modules will be chosen according to the model - architecture. If the architecture is not known, an error will be raised -- in this case, you should specify - the target modules manually. - feedforward_modules (`Optional[Union[List[str], str]]`): - The names of the modules to be treated as feedforward modules, as in the original paper. These modules will - have (IA)³ vectors multiplied to the input, instead of the output. `feedforward_modules` must be a name or - a subset of names present in `target_modules`. - fan_in_fan_out (`bool`): - Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses - `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`. - modules_to_save (`Optional[List[str]]`): - List of modules apart from (IA)³ layers to be set as trainable and saved in the final checkpoint. - init_ia3_weights (`bool`): - Whether to initialize the vectors in the (IA)³ layers, defaults to `True`. Setting this to `False` is - discouraged. - """ - target_modules: Optional[Union[List[str], str]] = field( - default=None, - metadata={ - "help": ( - "List of cell names or regex expression of the cell names to replace with (IA)³." - "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'." - "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer." - "If not specified, modules will be chosen according to the model architecture, If the architecture is " - "not known, an error will be raised -- in this case, you should specify the target modules manually." 
- ), - }, - ) - feedforward_modules: Optional[Union[List[str], str]] = field( - default=None, - metadata={ - "help": "List of cell names or a regex expression of cell names which are feedforward" - "For example, ['output.dense']" - }, - ) - fan_in_fan_out: bool = field( - default=False, - metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"}, - ) - modules_to_save: Optional[List[str]] = field( - default=None, - metadata={ - "help": "List of modules apart from (IA)^3 layers to be set as trainable and saved in the final checkpoint. " - "For example, in Sequence Classification or Token Classification tasks, " - "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." - }, - ) - init_ia3_weights: bool = field( - default=True, - metadata={"help": "Whether to initialize the vectors in the (IA)^3 layers."}, - ) - - def __post_init__(self): - r""" - This method initializes the IA3Config class after its instance has been created. - - Args: - self: An instance of the IA3Config class. - - Returns: - None. - - Raises: - ValueError: If the `feedforward_modules` parameter is not a subset of the `target_modules` parameter. - - Description: - The __post_init__ method sets default values for the IA3Config instance. It assigns the PeftType.IA3 value to the - peft_type attribute. The target_modules and feedforward_modules attributes are converted to sets if they are provided as - lists, or left unchanged if they are already sets. - - The method then performs a check to ensure that if both target_modules and feedforward_modules are sets, the - feedforward_modules subset is a subset of the target_modules set. If this check fails, a ValueError exception is raised - with the message '`feedforward_modules` should be a subset of `target_modules`'. - """ - self.peft_type = PeftType.IA3 - self.target_modules = ( - set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules - ) - self.feedforward_modules = ( - set(self.feedforward_modules) if isinstance(self.feedforward_modules, list) else self.feedforward_modules - ) - - # check if feedforward_modules is a subset of target_modules. run the check only if both are sets - if isinstance(self.feedforward_modules, set) and isinstance(self.target_modules, set): - if not self.feedforward_modules.issubset(self.target_modules): - raise ValueError("`feedforward_modules` should be a subset of `target_modules`") diff --git a/mindnlp/peft/tuners/ia3/layer.py b/mindnlp/peft/tuners/ia3/layer.py deleted file mode 100644 index fdc400ba9..000000000 --- a/mindnlp/peft/tuners/ia3/layer.py +++ /dev/null @@ -1,326 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
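Against the config class removed above, a typical (IA)³ setup lists the projections to scale and marks which of them are treated as feed-forward inputs; `__post_init__` turns both lists into sets and rejects any `feedforward_modules` entry that is not also a target. A hedged sketch using the T5-style module names that also appear in the `IA3Model` docstring further below:

```python
from mindnlp.peft.tuners.ia3 import IA3Config

config = IA3Config(
    task_type="SEQ_2_SEQ_LM",
    target_modules=["k", "v", "w0"],   # scale these projections' outputs...
    feedforward_modules=["w0"],        # ...except w0, which is scaled on its input
)
assert config.feedforward_modules.issubset(config.target_modules)

# Raises ValueError in __post_init__, since "wi" is not listed in target_modules:
# IA3Config(target_modules=["k", "v"], feedforward_modules=["wi"])
```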
-# ============================================================================ -"""ia3 layer""" -import warnings -from typing import Any, List, Optional - -import mindspore -from mindnlp.core import nn, ops -from mindnlp.transformers.ms_utils import Conv1D - -from ..tuners_utils import BaseTunerLayer, check_adapters_to_merge -from ...utils import transpose - - -class IA3Layer(BaseTunerLayer): - # All names of layers that may contain adapter weights - adapter_layer_names = ("ia3_l",) - - def __init__(self, base_layer: nn.Module, is_feedforward: bool, **kwargs) -> None: - self.base_layer = base_layer - self.ia3_l = nn.ParameterDict({}) - # Mark the weight as unmerged - self._disable_adapters = False - self.merged_adapters = [] - self.is_feedforward = is_feedforward - - base_layer = self.get_base_layer() - if isinstance(base_layer, nn.Linear): - in_features, out_features = base_layer.in_features, base_layer.out_features - elif isinstance(base_layer, (nn.Conv2d, nn.Conv3d)): - in_features, out_features = base_layer.in_channels, base_layer.out_channels - elif isinstance(base_layer, nn.Embedding): - in_features, out_features = base_layer.num_embeddings, base_layer.embedding_dim - elif isinstance(base_layer, Conv1D): - in_features, out_features = ( - base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape - ) - else: - raise ValueError(f"Unsupported layer type {type(base_layer)}") - self.in_features = in_features - self.out_features = out_features - - def update_layer(self, adapter_name, init_ia3_weights): - # This code works for linear layers, override for other layer types - # Actual trainable parameters - if self.is_feedforward: - weight = ops.randn((1, self.in_features)) - else: - weight = ops.randn((self.out_features, 1)) - self.ia3_l[adapter_name] = nn.Parameter(weight) - if init_ia3_weights: - self.reset_ia3_parameters(adapter_name) - self.set_adapter(self.active_adapters) - - def reset_ia3_parameters(self, adapter_name): - if adapter_name in self.ia3_l.keys(): - # initialize learned vector with ops.ones - nn.init.constant_(self.ia3_l[adapter_name], 1.0) - - -class Linear(nn.Module, IA3Layer): - # (IA)^3 implemented in a dense layer - def __init__( - self, - base_layer: nn.Module, - adapter_name: str, - fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) - is_feedforward: bool = False, # Set to True if the layer is treated as a feedforward layer - is_target_conv_1d_layer: bool = False, # whether target module is a conv1d layer. useful while unloading later - init_ia3_weights: bool = True, # whether to initialize IA3 weights - **kwargs, - ) -> None: - super().__init__() - IA3Layer.__init__(self, base_layer, is_feedforward=is_feedforward) - self.fan_in_fan_out = fan_in_fan_out - self.is_target_conv_1d_layer = is_target_conv_1d_layer - self._active_adapter = adapter_name - self.update_layer(adapter_name, init_ia3_weights) - - def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: - """ - Merge the active adapter weights into the base weights - - Args: - safe_merge (`bool`, *optional*): - If True, the merge operation will be performed in a copy of the original weights and check for NaNs - before merging the weights. This is useful if you want to check if the merge operation will produce - NaNs. Defaults to `False`. - adapter_names (`List[str]`, *optional*): - The list of adapter names that should be merged. If None, all active adapters will be merged. 
Defaults - to `None`. - """ - adapter_names = check_adapters_to_merge(self, adapter_names) - if not adapter_names: - # no adapter to merge - return - - for active_adapter in adapter_names: - if active_adapter in self.ia3_l.keys(): - base_layer = self.get_base_layer() - ia3_l = transpose(self.ia3_l[active_adapter].data, self.fan_in_fan_out) - orig_dtype = base_layer.weight.data.dtype - if safe_merge: - orig_weights = base_layer.weight.data - orig_weights = ops.mul(orig_weights, ia3_l) - - if not ops.isfinite(orig_weights).all(): - raise ValueError( - f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" - ) - base_layer.weight.data = orig_weights.to(orig_dtype) - else: - base_layer.weight.data = ops.mul(base_layer.weight.data, ia3_l).to(orig_dtype) - - if not self.is_feedforward and (base_layer.bias is not None): - scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape) - orig_dtype = base_layer.bias.data.dtype - base_layer.bias.data = ops.mul(base_layer.bias.data, scaling.data).to(orig_dtype) - - self.merged_adapters.append(active_adapter) - - def unmerge(self) -> None: - """ - This method unmerges all merged adapter layers from the base weights. - """ - if not self.merged: - warnings.warn("Already unmerged. Nothing to do.") - return - - warnings.warn("Unmerge result can be inaccurate for (IA)^3.") - while len(self.merged_adapters) > 0: - active_adapter = self.merged_adapters.pop() - if active_adapter in self.ia3_l.keys(): - base_layer = self.get_base_layer() - # Add tolerace to avoid division by zero - ia3_l = transpose(self.ia3_l[active_adapter].data, self.fan_in_fan_out) + 1e-8 - orig_dtype = base_layer.weight.data.dtype - base_layer.weight.data = ops.div(base_layer.weight.data, ia3_l).to(orig_dtype) - - if not self.is_feedforward and (base_layer.bias is not None): - scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape) - orig_dtype = base_layer.bias.data.dtype - base_layer.bias.data = ops.div(base_layer.bias.data, scaling.data + 1e-8).to(orig_dtype) - - def forward(self, x: mindspore.Tensor, *args: Any, **kwargs: Any) -> mindspore.Tensor: - dtype = previous_dtype = x.dtype - if self.disable_adapters: - if self.merged: - self.unmerge() - result = self.base_layer(x, *args, **kwargs) - elif self.merged: - result = self.base_layer(x, *args, **kwargs) - else: - ia3_scaling = 1 - for active_adapter in self.active_adapters: - if active_adapter not in self.ia3_l.keys(): - continue - dtype = self.ia3_l[active_adapter].dtype - ia3_scaling *= self.ia3_l[active_adapter].flatten() - - if self.is_feedforward: - x = x.to(dtype) - # TODO: weight.dtype can be != self.ia3_l[self.active_adapters].dtype - # e.g. bf16 vs fp32. Is that okay? 
- interm = (x * ia3_scaling).to(previous_dtype) - result = self.base_layer(interm, *args, **kwargs) - else: - result = self.base_layer(x, *args, **kwargs) - result_dtype = result.dtype - result = (result * ia3_scaling).to(result_dtype) - - return result - - -class _ConvNd(nn.Module, IA3Layer): - def __init__( - self, - base_layer: nn.Module, - adapter_name: str, - fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) - is_feedforward: bool = False, # Set to True if the layer is treated as a feedforward layer - init_ia3_weights: bool = True, - **kwargs, - ) -> None: - super().__init__() - IA3Layer.__init__(self, base_layer, is_feedforward=is_feedforward) - self.fan_in_fan_out = fan_in_fan_out - self._active_adapter = adapter_name - self._kernel_dim = base_layer.weight.dim() - - self.update_layer(adapter_name, init_ia3_weights) - - def update_layer(self, adapter_name, init_ia3_weights): - # Actual trainable parameters - num_features = self.in_features if self.is_feedforward else self.out_features - weights_size = (1, num_features) + (1,) * (self._kernel_dim - 2) - weight = ops.randn(weights_size) - self.ia3_l[adapter_name] = nn.Parameter(weight) - if init_ia3_weights: - self.reset_ia3_parameters(adapter_name) - self.set_adapter(self.active_adapters) - - def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: - """ - Merge the active adapter weights into the base weights - - Args: - safe_merge (`bool`, *optional*): - If True, the merge operation will be performed in a copy of the original weights and check for NaNs - before merging the weights. This is useful if you want to check if the merge operation will produce - NaNs. Defaults to `False`. - adapter_names (`List[str]`, *optional*): - The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults - to `None`. - """ - adapter_names = check_adapters_to_merge(self, adapter_names) - if not adapter_names: - # no adapter to merge - return - - for active_adapter in adapter_names: - if active_adapter in self.ia3_l.keys(): - base_layer = self.get_base_layer() - ia3_scaling = self.ia3_l[active_adapter].data - if not self.is_feedforward: - ia3_scaling = ops.transpose(ia3_scaling, 0, 1) - - if safe_merge: - output_weight = ops.mul(base_layer.weight.data, ia3_scaling).clone() - - if not ops.isfinite(output_weight).all(): - raise ValueError( - f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" - ) - - base_layer.weight.data = output_weight - else: - base_layer.weight.data = ops.mul(base_layer.weight.data, ia3_scaling) - - if not self.is_feedforward and (base_layer.bias is not None): - scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape) - base_layer.bias.data = ops.mul(base_layer.bias.data, scaling.data) - - self.merged_adapters.append(active_adapter) - - def unmerge(self) -> None: - """ - This method unmerges all merged adapter layers from the base weights. - """ - if not self.merged: - warnings.warn("Already unmerged. Nothing to do.") - return - - warnings.warn("Unmerge result can be inaccurate for (IA)^3.") - while len(self.merged_adapters) > 0: - active_adapter = self.merged_adapters.pop() - if active_adapter in self.ia3_l.keys(): - base_layer = self.get_base_layer() - # divide by (IA)^3 vector. 
Add tolerace to avoid division by zero - ia3_scaling = self.ia3_l[active_adapter].data - if not self.is_feedforward: - ia3_scaling = ops.transpose(ia3_scaling, 0, 1) - base_layer.weight.data = ops.div(base_layer.weight.data, ia3_scaling + 1e-8) - - if not self.is_feedforward and (base_layer.bias is not None): - scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape) - base_layer.bias.data = ops.mul(base_layer.bias.data, scaling.data) - - def forward(self, x: mindspore.Tensor, *args: Any, **kwargs: Any) -> mindspore.Tensor: - dtype = previous_dtype = x.dtype - - if self.disable_adapters: - if self.merged: - self.unmerge() - result = self.base_layer(x, *args, **kwargs) - elif self.merged: - result = self.base_layer(x, *args, **kwargs) - else: - ia3_scaling = 1 - for active_adapter in self.active_adapters: - if active_adapter not in self.ia3_l.keys(): - continue - dtype = self.ia3_l[active_adapter].dtype - ia3_scaling *= self.ia3_l[active_adapter] - - if self.is_feedforward: - x = x.to(dtype) - # TODO: weight.dtype can be != self.ia3_l[self.active_adapters].dtype - # e.g. bf16 vs fp32. Is that okay? - interm = (x * ia3_scaling).to(self.get_base_layer().weight.dtype) - result = self.base_layer(interm, *args, **kwargs) - else: - result = self.base_layer(x, *args, **kwargs) - result = result.to(dtype) * ia3_scaling - - result = result.to(previous_dtype) - return result - - -class Conv2d(_ConvNd): - # IA3 implemented in a 2D convolutional layer - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if not self._kernel_dim == 4: - raise ValueError(f"Conv2d layer kernel must have 4 dimensions, not {self._kernel_dim}") - - -class Conv3d(_ConvNd): - # IA3 implemented in a 3D convolutional layer - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if not self._kernel_dim == 5: - raise ValueError(f"Conv2d layer kernel must have 5 dimensions, not {self._kernel_dim}") diff --git a/mindnlp/peft/tuners/ia3/model.py b/mindnlp/peft/tuners/ia3/model.py deleted file mode 100644 index 2d8f208e0..000000000 --- a/mindnlp/peft/tuners/ia3/model.py +++ /dev/null @@ -1,566 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
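The `merge` / `unmerge` logic in the layers above rests on a simple identity: element-wise scaling of a linear layer's output by the learned vector `ia3_l` is equivalent to scaling the corresponding rows of its weight (and entries of its bias), which is what folding the vector in at merge time does. A NumPy check of that equivalence for the non-feedforward `Linear` case (MindSpore `ops` calls replaced by NumPy; shapes follow the deleted code):

```python
import numpy as np

rng = np.random.default_rng(0)
in_features, out_features = 16, 8
W = rng.normal(size=(out_features, in_features))   # nn.Linear weight layout
b = rng.normal(size=out_features)
x = rng.normal(size=(4, in_features))
ia3_l = rng.normal(size=(out_features, 1))          # learned (IA)^3 vector

# forward() with an unmerged adapter: run the frozen base layer, scale the output.
unmerged = (x @ W.T + b) * ia3_l.ravel()

# merge(): fold the vector into the base parameters once.
W_merged = W * ia3_l            # scales each output row
b_merged = b * ia3_l.ravel()
merged = x @ W_merged.T + b_merged

assert np.allclose(unmerged, merged)
```

The feed-forward case mirrors this with a `(1, in_features)` vector applied to the input, i.e. the weight columns are scaled instead, and `unmerge` divides the same factors back out with a small epsilon to avoid division by zero.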
-# ============================================================================ -# pylint: disable=arguments-differ -# pylint: disable=arguments-renamed -# pylint: disable=useless-parent-delegation -# pylint: disable=line-too-long -# pylint: disable=unused-variable -# pylint: disable=unused-argument -# pylint: disable=too-many-arguments -"IA3 Model" -from __future__ import annotations - -import re -import warnings -from dataclasses import asdict -from enum import Enum -from typing import Optional - -from mindnlp.core import nn - -from mindnlp.transformers.ms_utils import Conv1D -from mindnlp.peft.utils import ( - TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING, - TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING, - ModulesToSaveWrapper, - _get_submodules, -) -from ..tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists -from .layer import Conv2d, IA3Layer, Linear - - -class IA3Model(BaseTuner): - """ - Creates a Infused Adapter by Inhibiting and Amplifying Inner Activations ((IA)^3) model from a pretrained - transformers model. The method is described in detail in https://arxiv.org/abs/2205.05638 - - Args: - model ([`~transformers.PreTrainedModel`]): The model to be adapted. - config ([`IA3Config`]): The configuration of the (IA)^3 model. - adapter_name (`str`): The name of the adapter, defaults to `"default"`. - - Returns: - IA3Model ([`mindspore.nn.Module`]): The IA3Lora model. - - Example: - - ```py - >>> from transformers import AutoModelForSeq2SeqLM, ia3Config - >>> from peft import IA3Model, IA3Config - - >>> config = IA3Config( - ... peft_type="IA3", - ... task_type="SEQ_2_SEQ_LM", - ... target_modules=["k", "v", "w0"], - ... feedforward_modules=["w0"], - ... ) - - >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") - >>> ia3_model = IA3Model(config, model) - ``` - > **Attributes**: - - > - **model** ([`transformers.PreTrainedModel`])— The model to be adapted. - - > - **peft_config** ([`IA3Config`]): The configuration of the (IA)^3 model. - """ - prefix: str = "ia3_" - - def __init__(self, model, config, adapter_name): - r""" - Initializes an instance of the IA3Model class. - - Args: - self: The instance of the IA3Model class. - model: The model object to be initialized. - config: The configuration settings for the model. - adapter_name: The name of the adapter. - - Returns: - None. This method does not return any value. - - Raises: - No specific exceptions are raised by this method. - """ - super().__init__(model, config, adapter_name) - - @staticmethod - def _create_new_cell(ia3_config, adapter_name, target, **kwargs): - r""" - Creates a new cell based on the provided parameters. - - Args: - ia3_config (IA3Config): The configuration object for IA3Model. - adapter_name (str): The name of the adapter. - target (object): The target cell for which a new cell needs to be created. - - Returns: - None - - Raises: - ValueError: If the target cell is not supported. Only `torch.nn.Linear`, `torch.nn.Conv2d`, and `Conv1D` are supported. - TypeError: If the target is not an instance of `BaseTunerLayer` or `nn.Conv2d`. - TypeError: If the target base layer is not an instance of `nn.Linear` or `Conv1D`. - - Note: - - The `loaded_in_8bit`, `loaded_in_4bit`, and `is_feedforward` parameters are optional and can be provided as keyword arguments. - - The `fan_in_fan_out` parameter is expected to be present in the `kwargs` dictionary. - - Depending on the type of `target` and `target_base_layer`, the appropriate cell (Conv2d or Linear) is created. 
- - If `target` is an instance of `BaseTunerLayer`, `target_base_layer` is obtained using `get_base_layer()` method. - - If `target` is `nn.Conv2d`, a new instance of `Conv2d` is created with the provided arguments. - - If `target_base_layer` is `nn.Linear`, a new instance of `Linear` is created with the provided arguments. - - If `target_base_layer` is `Conv1D`, a new instance of `Linear` is created with additional arguments indicating that the target is a Conv1D layer. - - The created cell is returned. - - """ - # avoid eager bnb import - # if is_bnb_available(): - # import bitsandbytes as bnb - - # from .bnb import Linear8bitLt - - # if is_bnb_4bit_available(): - # from .bnb import Linear4bit - - loaded_in_8bit = kwargs.pop("loaded_in_8bit", False) - loaded_in_4bit = kwargs.pop("loaded_in_4bit", False) - is_feedforward = kwargs.pop("is_feedforward", False) - - if isinstance(target, BaseTunerLayer): - target_base_layer = target.get_base_layer() - else: - target_base_layer = target - - # if loaded_in_8bit and isinstance(target_base_layer, bnb.nn.Linear8bitLt): - # eightbit_kwargs = kwargs.copy() - # eightbit_kwargs.update( - # { - # "has_fp16_weights": target_base_layer.state.has_fp16_weights, - # "memory_efficient_backward": target_base_layer.state.memory_efficient_backward, - # "threshold": target_base_layer.state.threshold, - # "index": target_base_layer.index, - # } - # ) - # new_cell = Linear8bitLt(target, adapter_name, is_feedforward=is_feedforward, **eightbit_kwargs) - # elif loaded_in_4bit and isinstance(target_base_layer, bnb.nn.Linear4bit): - # fourbit_kwargs = kwargs.copy() - # fourbit_kwargs.update( - # { - # "compute_dtype": target_base_layer.compute_dtype, - # "compress_statistics": target_base_layer.weight.compress_statistics, - # "quant_type": target_base_layer.weight.quant_type, - # } - # ) - # new_cell = Linear4bit(target, adapter_name, is_feedforward=is_feedforward, **fourbit_kwargs) - if isinstance(target, nn.Conv2d): - new_cell = Conv2d(target, adapter_name, is_feedforward=is_feedforward, **kwargs) - elif isinstance(target_base_layer, nn.Linear): - if kwargs["fan_in_fan_out"]: - warnings.warn( - "fan_in_fan_out is set to True but the target cell is `torch.nn.Linear`. " - "Setting fan_in_fan_out to False." - ) - kwargs["fan_in_fan_out"] = ia3_config.fan_in_fan_out = False - new_cell = Linear(target, adapter_name, is_feedforward=is_feedforward, **kwargs) - elif isinstance(target_base_layer, Conv1D): - if not kwargs["fan_in_fan_out"]: - warnings.warn( - "fan_in_fan_out is set to False but the target cell is `Conv1D`. " - "Setting fan_in_fan_out to True." - ) - kwargs["fan_in_fan_out"] = ia3_config.fan_in_fan_out = True - new_cell = Linear( - target, adapter_name, is_feedforward=is_feedforward, is_target_conv_1d_layer=True, **kwargs - ) - else: - raise ValueError( - f"Target cell {target} is not supported. " - f"Currently, only `torch.nn.Linear`, `torch.nn.Conv2d`, and `Conv1D` are supported." - ) - return new_cell - - @staticmethod - def _check_target_module_exists(ia3_config, key): - r""" - Checks if the target cell exists in the IA3 configuration. - - Args: - ia3_config (dict): The IA3 configuration dictionary. - This dictionary contains the configuration information for the IA3Model. - The target cell is checked against this configuration. - key (str): The target cell key to be checked. - This key represents the target cell to be verified against the IA3 configuration. - - Returns: - None: This method does not return any value. 
- - Raises: - None: This method does not raise any exceptions. - """ - return check_target_module_exists(ia3_config, key) - - def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: - r""" - Marks only the adapters in the given model as trainable. - - Args: - self (IA3Model): The instance of the IA3Model class. - model (nn.Module): The model for which the adapters need to be marked as trainable. - - Returns: - None. This method does not return any value. - - Raises: - None. - """ - for name, param in model.parameters_and_names(): - if self.prefix not in name: - param.requires_grad = False - - def _create_and_replace( - self, - ia3_config, - adapter_name, - target, - target_name, - parent, - **optionnal_kwargs, - ): - r""" - Creates a new cell and replaces the target cell with it. - - Args: - self (IA3Model): The current instance of the IA3Model class. - ia3_config: The configuration settings for the IA3 model. - adapter_name: The name of the adapter. - target: The target cell to be replaced. - target_name: The name of the target cell. - parent: The parent cell of the target cell. - - Returns: - None - - Raises: - None - """ - def _create_and_replace(self, ia3_config, adapter_name, target, target_name, parent): - """ - Creates a new cell and replaces the target cell with it. - - Args: - self (IA3Model): The current instance of the IA3Model class. - ia3_config: The configuration settings for the IA3 model. - adapter_name: The name of the adapter. - target: The target cell to be replaced. - target_name: The name of the target cell. - parent: The parent cell of the target cell. - - Returns: - None - - Raises: - None - """ - current_key = optionnal_kwargs.pop('current_key') - is_feedforward = self._check_target_module_feedforward(ia3_config, current_key) - kwargs = {'fan_in_fan_out': ia3_config.fan_in_fan_out, 'init_ia3_weights': ia3_config.init_ia3_weights, 'is_feedforward': is_feedforward} - kwargs['loaded_in_8bit'] = optionnal_kwargs.pop('loaded_in_8bit', False) - kwargs['loaded_in_4bit'] = optionnal_kwargs.pop('loaded_in_4bit', False) - if isinstance(target, IA3Layer): - target.update_layer(adapter_name, ia3_config.init_ia3_weights) - else: - new_cell = self._create_new_cell(ia3_config, adapter_name, target, **kwargs) - if adapter_name not in self.active_adapters: - new_cell.requires_grad = False - self._replace_cell(parent, target_name, new_cell, target) - # check if target cell is in feedforward_modules - current_key = optionnal_kwargs.pop("current_key") - is_feedforward = self._check_target_module_feedforward(ia3_config, current_key) - - kwargs = { - "fan_in_fan_out": ia3_config.fan_in_fan_out, - "init_ia3_weights": ia3_config.init_ia3_weights, - "is_feedforward": is_feedforward, - } - kwargs["loaded_in_8bit"] = optionnal_kwargs.pop("loaded_in_8bit", False) - kwargs["loaded_in_4bit"] = optionnal_kwargs.pop("loaded_in_4bit", False) - if isinstance(target, IA3Layer): - target.update_layer( - adapter_name, - ia3_config.init_ia3_weights, - ) - else: - new_cell = self._create_new_cell(ia3_config, adapter_name, target, **kwargs) - if adapter_name not in self.active_adapters: - # adding an additional adapter: it is not automatically trainable - new_cell.requires_grad = False - self._replace_cell(parent, target_name, new_cell, target) - - @staticmethod - def _check_target_module_feedforward(ia3_config, key) -> bool: - """ - A helper private method that checks if the target cell `key` matches with a feedforward cell specified in - `ia3_config` - """ - if 
isinstance(ia3_config.feedforward_modules, str): - is_feedforward = bool(re.fullmatch(ia3_config.feedforward_modules, key)) - else: - is_feedforward = any(key.endswith(target_key) for target_key in ia3_config.feedforward_modules) - return is_feedforward - - def _replace_cell(self, parent, child_name, new_cell, child): - r""" - Replaces a specified child object in the parent object with a new object. - - Args: - self (IA3Model): The instance of the IA3Model class. - parent: The parent object where the child object is located. - child_name: The name of the child object to be replaced. - new_cell: The new object that will replace the child object. - child: The child object to be replaced. - - Returns: - None. This method does not return any value. - - Raises: - None. - - Note: - This method replaces the child object in the parent object with the new object. If the child object has a 'base_layer' - attribute, the method updates the child object to be the 'base_layer'. If the new object does not have a 'base_layer' - attribute, the method copies the weight and bias attributes from the child object to the new object. If the child object - has a 'state' attribute, the method updates the 'state' attribute of the new object to match the child object's 'state'. - """ - setattr(parent, child_name, new_cell) - - # child layer wraps the original cell, unpack it - if hasattr(child, "base_layer"): - child = child.base_layer - - # layers with base_layer don't need the weight to be copied, as they have a reference already - if not hasattr(new_cell, "base_layer"): - new_cell.weight = child.weight - if hasattr(child, "bias"): - new_cell.bias = child.bias - - if getattr(child, "state", None) is not None: - if hasattr(new_cell, "base_layer"): - new_cell.base_layer.state = child.state - else: - new_cell.state = child.state - - def __getattr__(self, name: str): - """Forward missing attributes to the wrapped cell.""" - try: - return super().__getattr__(name) # defer to nn.Module's logic - except AttributeError: - return getattr(self.model, name) - - def get_peft_config_as_dict(self, inference: bool = False): - """Get the configuration of the (IA)^3 model as a dictionary.""" - config_dict = {} - for key, value in self.peft_config.items(): - config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()} - if inference: - config["inference_mode"] = True - config_dict[key] = config - return config - - def _set_adapter_layers(self, enabled=True): - r""" - Method to set the adapter layers in the IA3Model. - - Args: - self (IA3Model): The instance of the IA3Model class. - enabled (bool, optional): A flag indicating whether to enable the adapter layers. Default is True. - - Returns: - None. This method does not return any value. - - Raises: - - TypeError: If the 'enabled' parameter is not a boolean. - - AttributeError: If the 'IA3Model' instance does not have a 'modules' method. - - ValueError: If the 'IA3Model' instance's modules include a cell that is not an IA3Layer or a ModulesToSaveWrapper. - """ - for cell in self.model.modules(): - if isinstance(cell, (IA3Layer, ModulesToSaveWrapper)): - cell.enable_adapters(enabled) - - def enable_adapter_layers(self) -> None: - """Enable all adapters. - - Call this if you have previously disabled all adapters and want to re-enable them. - """ - self._set_adapter_layers(enabled=True) - - def disable_adapter_layers(self) -> None: - """Disable all adapters. - - When disabling all adapters, the model output corresponds to the output of the base model. 
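The feedforward check in `_check_target_module_feedforward` above follows a simple rule: a string value of `feedforward_modules` is treated as a regular expression that must match the whole module key, while a list is treated as a set of key suffixes. A standalone restatement of that rule, with made-up module keys for illustration:

```py
import re
from typing import Sequence, Union


def is_feedforward_module(feedforward_modules: Union[str, Sequence[str]], key: str) -> bool:
    # A string is a regex matched against the full key; a list/set is a suffix collection.
    if isinstance(feedforward_modules, str):
        return bool(re.fullmatch(feedforward_modules, key))
    return any(key.endswith(target_key) for target_key in feedforward_modules)


print(is_feedforward_module(".*decoder.*w0", "model.decoder.block.0.w0"))  # True
print(is_feedforward_module(["w0", "wi"], "model.encoder.block.1.wi"))     # True
print(is_feedforward_module(["w0", "wi"], "model.encoder.block.1.q"))      # False
```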
- """ - self._set_adapter_layers(enabled=False) - - def set_adapter(self, adapter_name: str | list[str]) -> None: - """Set the active adapter(s). - - Additionally, this function will set the specified adapters to trainable (i.e., requires_grad=True). If this is - not desired, use the following code. - - ```py - >>> for name, param in model_peft.named_parameters(): - ... if ...: # some check on name (ex. if 'lora' in name) - ... param.requires_grad = False - ``` - - Args: - adapter_name (`str` or `list[str]`): Name of the adapter(s) to be activated. - """ - for cell in self.model.modules(): - if isinstance(cell, IA3Layer): - if cell.merged: - warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.") - cell.unmerge() - cell.set_adapter(adapter_name) - self.active_adapter = adapter_name - - def _prepare_adapter_config(self, peft_config, model_config): - r""" - Prepare the adapter configuration for the IA3Model. - - Args: - self (IA3Model): The instance of the IA3Model class. - peft_config (object): The configuration object for the adapter. - model_config (dict): The configuration dictionary for the model. - - Returns: - None: This method does not return any value. - - Raises: - ValueError: If `peft_config.target_modules` is None and `model_config['model_type']` is not found in TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING. - ValueError: If `peft_config.feedforward_modules` is None and `model_config['model_type']` is not found in TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING. - """ - if peft_config.target_modules is None: - if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING: - raise ValueError("Please specify `target_modules` in `peft_config`") - peft_config.target_modules = TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING[model_config["model_type"]] - if peft_config.feedforward_modules is None: - if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING: - raise ValueError("Please specify `feedforward_modules` in `peft_config`") - peft_config.feedforward_modules = TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING[ - model_config["model_type"] - ] - return peft_config - - def _unload_and_optionally_merge( - self, merge: bool = True, safe_merge: bool = False, adapter_names: Optional[list[str]] = None - ): - r""" - This method merges the (IA)^3 layers into the base model. This is needed if someone wants to use the base model - as a standalone model. - - Args: - safe_merge (`bool`, `optional`, defaults to `False`): - If True, the merge operation will be performed in a copy of the original weights and check for NaNs - before merging the weights. This is useful if you want to check if the merge operation will produce - NaNs. Defaults to `False`. - adapter_names (`List[str]`, *optional*): - The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults - to `None`. 
- """ - if getattr(self.model, "is_loaded_in_8bit", False): - raise ValueError("Cannot merge ia3 layers when the model is loaded in 8-bit mode") - - if getattr(self.model, "is_loaded_in_4bit", False): - raise ValueError("Cannot merge ia3 layers when the model is loaded in 4-bit mode") - - self._unloading_checks(adapter_names) - key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] - for key in key_list: - try: - parent, target, target_name = _get_submodules(self.model, key) - except AttributeError: - continue - - if hasattr(target, "base_layer"): - if merge: - target.merge(safe_merge=safe_merge, adapter_names=adapter_names) - self._replace_cell(parent, target_name, target.get_base_layer(), target) - elif isinstance(target, ModulesToSaveWrapper): - # save any additional trainable modules part of `modules_to_save` - new_cell = target.modules_to_save[target.active_adapter] - if hasattr(new_cell, "base_layer"): - # check if the cell is itself a tuner layer - if merge: - new_cell.merge(safe_merge=safe_merge, adapter_names=adapter_names) - new_cell = new_cell.get_base_layer() - setattr(parent, target_name, new_cell) - - return self.model - - def merge_and_unload(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> nn.Module: - r""" - This method merges the IA³ layers into the base model. This is needed if someone wants to use the base model as - a standalone model. - - Args: - safe_merge (`bool`): - whether to activate the safe merging check to check if there is any potential Nan in the adapter - weights - adapter_names (`List[str]`, *optional*): - The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults - to `None`. - - Example: - - ```py - >>> from transformers import AutoModelForCausalLM - >>> from peft import PeftModel - - >>> base_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b") - >>> peft_model_id = "smangrul/falcon-40B-int4-peft-lora-sfttrainer-sample" - >>> model = PeftModel.from_pretrained(base_model, peft_model_id) - >>> merged_model = model.merge_and_unload() - ``` - """ - return self._unload_and_optionally_merge(safe_merge=safe_merge, adapter_names=adapter_names) - - def unload(self) -> nn.Module: - """ - Gets back the base model by removing all the IA³ modules without merging. This gives back the original base - model. - """ - return self._unload_and_optionally_merge(merge=False) - - def delete_adapter(self, adapter_name: str) -> None: - """ - Deletes an existing adapter. - - Args: - adapter_name (str): Name of the adapter to be deleted. - """ - if adapter_name not in self.peft_config: - raise ValueError(f"Adapter {adapter_name} does not exist") - del self.peft_config[adapter_name] - - key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] - new_adapter = None - for key in key_list: - _, target, _ = _get_submodules(self.model, key) - if isinstance(target, IA3Layer): - target.delete_adapter(adapter_name) - if new_adapter is None: - new_adapter = target.active_adapters[:] - - self.active_adapter = new_adapter or [] diff --git a/mindnlp/peft/tuners/ln_tuning/__init__.py b/mindnlp/peft/tuners/ln_tuning/__init__.py deleted file mode 100644 index c5cada72e..000000000 --- a/mindnlp/peft/tuners/ln_tuning/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""ln_tuning.""" -from .config import LNTuningConfig -from .model import LNTuningModel - -__all__ = ["LNTuningConfig", "LNTuningModel"] diff --git a/mindnlp/peft/tuners/ln_tuning/config.py b/mindnlp/peft/tuners/ln_tuning/config.py deleted file mode 100644 index 4d5f89b00..000000000 --- a/mindnlp/peft/tuners/ln_tuning/config.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""ln_tuning config""" -from __future__ import annotations - -from dataclasses import dataclass, field -from typing import Optional, Union - -from mindnlp.peft.config import PeftConfig -from mindnlp.peft.utils import PeftType - - -@dataclass -class LNTuningConfig(PeftConfig): - """ - This is the configuration class to store the configuration of a :class:`~peft.tuners.LNTuningModel`. - - Args: - target_modules (`Optional[Union[List[str], str]]`): - List of cell names or regex expression of the cell names to replace with LNTuning. For example, - '.*decoder.*' or '.*encoder.*'. If this is not specified, modules will be chosen according to the model - architecture. If the architecture is not known, an error will be raised -- in this case, you should specify - the target modules manually. - modules_to_save (`Optional[Union[List[str], str]]`): - List of modules to be set as trainable and saved in the final checkpoint. For example, in Sequence - Classification or Token Classification tasks, the final layer `classifier/score` are randomly initialized - and as such need to be trainable and saved. - """ - - target_modules: Optional[Union[list[str], str]] = field( - default=None, - metadata={ - "help": ( - "List of cell names or regex expression of the cell names to replace with LNTuning." - "For example, '.*decoder.*' or '.*encoder.*'. " - "If not specified, modules will be chosen according to the model architecture, If the architecture is " - "not known, an error will be raised -- in this case, you shoud specify the target modules manually." - ), - }, - ) - modules_to_save: Optional[Union[list[str], str]] = field( - default=None, - metadata={ - "help": "List of modules to be set as trainable and saved in the final checkpoint. " - "For example, in Sequence Classification or Token Classification tasks, " - "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." 
- }, - ) - - def __post_init__(self): - self.peft_type = PeftType.LN_TUNING diff --git a/mindnlp/peft/tuners/ln_tuning/layer.py b/mindnlp/peft/tuners/ln_tuning/layer.py deleted file mode 100644 index d3d88dd6f..000000000 --- a/mindnlp/peft/tuners/ln_tuning/layer.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""ln_tuning layer""" -import warnings -from copy import deepcopy -from typing import List, Optional - -import mindspore -from mindnlp.core import nn - -from mindnlp.peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge - - -class LNTuningLayer(nn.Module, BaseTunerLayer): - """ - Selects a layer from the model. - """ - - adapter_layer_names = ("ln_tuning_layers",) - - def __init__(self, base_layer: nn.Module, adapter_name: str): - super().__init__() - self.base_layer = base_layer - self.ln_tuning_layers = nn.ModuleDict({}) - self.update_layer(self.base_layer, adapter_name) - self._active_adapter = adapter_name - self.merged_adapters = [] - - def update_layer(self, layer: nn.Module, adapter_name: str): - self.ln_tuning_layers[adapter_name] = deepcopy(layer) - - def enable_adapters(self, enabled: bool) -> None: - """Toggle the enabling and disabling of adapters - - Takes care of setting the requires_grad flag for the adapter weights. - - Args: - enabled (bool): True to enable adapters, False to disable adapters - """ - if enabled: - self.set_adapter(self.active_adapters) - self._disable_adapters = False - else: - if self.merged: - self.unmerge() - # disable grads on all adapter layers - for layer_name in self.adapter_layer_names: - layer = getattr(self, layer_name) - layer.requires_grad = False - self._disable_adapters = True - - def merge(self, adapter_names: Optional[List[str]] = None): - adapter_names = check_adapters_to_merge(self, adapter_names) - if not adapter_names: - # no adapter to merge - return - - if len(adapter_names) > 1: - raise ValueError( - f"Trying to merge {len(adapter_names)} adapters, but LN " - f"tuning does not allow merging more than one adapter at a time" - ) - merged_adapters = set(self.merged_adapters) - if merged_adapters: - warnings.warn(f"Already merged with {merged_adapters}. Unmerging first.") - self.unmerge() - - self.base_layer, self.ln_tuning_layers[adapter_names[0]] = ( - self.ln_tuning_layers[adapter_names[0]], - self.base_layer, - ) - self.merged_adapters.append(adapter_names[0]) - - def unmerge(self): - if not self.merged: - warnings.warn("Already unmerged. Nothing to do.") - return - # popping one element is sufficient because LN - # tuning does not allow merging more than one adapter at a time. 
- merged_name = self.merged_adapters.pop() - self.base_layer, self.ln_tuning_layers[merged_name] = ( - self.ln_tuning_layers[merged_name], - self.base_layer, - ) - - def forward(self, x: mindspore.Tensor, *args, **kwargs) -> mindspore.Tensor: - if self.disable_adapters: - if self.merged: - self.unmerge() - result = self.base_layer(x, *args, **kwargs) - elif self.merged: - result = self.base_layer(x, *args, **kwargs) - else: - if len(self.active_adapters) != 1: - raise ValueError( - f"Trying to run forward with {len(self.active_adapters)} active " - f"adapters, but LN tuning does not allow inference with more than one adapter at a time" - ) - active_adapter = self.active_adapters[0] - result = self.ln_tuning_layers[active_adapter](x, *args, **kwargs) - - return result - - def __repr__(self) -> str: - rep = super().__repr__() - return "ln_tuning." + rep diff --git a/mindnlp/peft/tuners/ln_tuning/model.py b/mindnlp/peft/tuners/ln_tuning/model.py deleted file mode 100644 index 8c1ac32a0..000000000 --- a/mindnlp/peft/tuners/ln_tuning/model.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""ln_tuning model""" -from __future__ import annotations - -import warnings -from typing import Optional - -from mindspore import nn -from mindspore.nn.cell import Cell -from tqdm import tqdm - -from mindnlp.peft.config import PeftConfig -from mindnlp.peft.tuners.tuners_utils import ( - BaseTuner, - _get_submodules, - check_target_module_exists, -) -from mindnlp.peft.utils import ( - TRANSFORMERS_MODELS_TO_LNTUNING_TARGET_MODULES_MAPPING, - ModulesToSaveWrapper, -) - -from .layer import LNTuningLayer - - -class LNTuningModel(BaseTuner): - """ - Creates LayerNorm tuning from a pretrained transformer model. - - The method is described in detail in https://arxiv.org/abs/2312.11420. - - Args: - model ([`mindspore.nn.Module`]): The model to be adapted. - config ([`LNTuningConfig`]): The configuration of the Lora model. - adapter_name (`str`): The name of the adapter, defaults to `"default"`. - - Returns: - 'mindspore.nn.Module': The adapted model with LayerNorm tuned on. - - Example: - - ```py - >>> from mindnlp.transformers import AutoModelForCausalLM - >>> from mindnlp.peft import get_peft_model, TaskType, LNTuningConfig - - >>> peft_config = LNTuningConfig( - ... task_type=TaskType.CAUSAL_LM, - ... ) - - >>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") - >>> model = get_peft_model(model, peft_config) - >>> model.print_trainable_parameters() - ``` - - **Attributes**: - - **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted. - - **peft_config** ([`LNTuningConfig`]): The configuration of the Lora model. 
- """ - - prefix: str = "ln_tuning_" - - # def __init__(self, model, config, adapter_name) -> None: - # # self.adapter_name = adapter_name - # super().__init__(model, config, adapter_name) - - def __getattr__(self, name: str): - """Forward missing attributes to the wrapped module.""" - try: - return super().__getattr__(name) # defer to nn.Module's logic - except AttributeError: - return getattr(self.model, name) - - # TODO: here need to handle the modules_to_save rather than the target_modules - @staticmethod - def _prepare_adapter_config( - peft_config: PeftConfig, model_config: dict - ) -> PeftConfig: - if peft_config.target_modules is None: - if ( - model_config["model_type"] - not in TRANSFORMERS_MODELS_TO_LNTUNING_TARGET_MODULES_MAPPING - ): - raise ValueError("Please specify `target_modules` in `peft_config`") - peft_config.target_modules = set( - TRANSFORMERS_MODELS_TO_LNTUNING_TARGET_MODULES_MAPPING[ - model_config["model_type"] - ] - ) - return peft_config - - def _create_and_replace( - self, - peft_config: PeftConfig, - adapter_name: str, - target: Cell, - target_name: str, - parent: Cell, - current_key: str, - ) -> None: - # replace the original cell with a same new cell - new_cell = self._create_new_cell(peft_config, target, adapter_name) - if adapter_name != self.active_adapter: - new_cell.requires_grad = False - self._replace_module(parent, target_name, new_cell, target) - - def _create_new_cell( - self, - peft_config: PeftConfig, - target: Cell, - adapter_name: str, - ) -> Cell: - if not isinstance(target, LNTuningLayer): - new_cell = LNTuningLayer(target, adapter_name) - else: - new_cell = target - new_cell.update_layer(target.base_layer, adapter_name) - return new_cell - - def _replace_module( - self, parent: Cell, child_name: str, new_cell: Cell, child: Cell - ) -> None: - setattr(parent, child_name, new_cell) - - if hasattr(child, "base_layer"): - child = child.base_layer - - if getattr(child, "state", None) is not None: - if hasattr(new_cell, "base_layer"): - new_cell.base_layer.state = child.state - else: - new_cell.state = child.state - - def _mark_only_adapters_as_trainable(self, model: Cell): - for n, p in model.parameters_and_names(): - if self.prefix not in n: - p.requires_grad = False - else: - p.requires_grad = True - - def _check_target_module_exists(self, peft_config: PeftConfig, key: str) -> bool: - return check_target_module_exists(peft_config, key) - - def _set_adapter_layers(self, enabled: bool) -> None: - for cell in self.model.modules(): - if isinstance(cell, (LNTuningLayer, ModulesToSaveWrapper)): - cell.enable_adapters(enabled) - - def enable_adapter_layers(self) -> None: - """Enable all adapters. - - Call this if you have previously disabled all adapters and want to re-enable them. - """ - self._set_adapter_layers(enabled=True) - - def disable_adapter_layers(self) -> None: - """Disable all adapters. - - When disabling all adapters, the model output corresponds to the output of the base model. - """ - self._set_adapter_layers(enabled=False) - - def set_adapter(self, adapter_name: str) -> None: - for cell in self.model.modules(): - if isinstance(cell, LNTuningLayer): - if cell.merged: - warnings.warn( - "Adapter cannot be set when the model is merged. Unmerging the model first." 
- ) - cell.unmerge() - cell.set_adapter(adapter_name) - self.active_adapter = adapter_name - - def _unload_and_optionally_merge( - self, - merge=True, - progressbar: bool = False, - safe_merge: bool = False, - adapter_names: Optional[list[str]] = None, - ): - self._unloading_checks(adapter_names) - key_list = [ - key for key, _ in self.model.named_modules() if self.prefix not in key - ] - desc = "Unloading adapters " + ("and merging " if merge else "") + "model" - - for key in tqdm(key_list, disable=not progressbar, desc=desc): - try: - parent, target, target_name = _get_submodules(self.model, key) - except AttributeError: - continue - - if hasattr(target, "base_layer"): - if merge: - target.merge(adapter_names) - self._replace_module( - parent, target_name, target.get_base_layer(), target - ) - - return self.model - - def unload(self): - return self._unload_and_optionally_merge(merge=False) - - def merge_and_unload( - self, - progressbar: bool = False, - safe_merge: bool = False, - adapter_names: Optional[list[str]] = None, - ) -> nn.Module: - return self._unload_and_optionally_merge(merge=True) diff --git a/mindnlp/peft/tuners/loha/__init__.py b/mindnlp/peft/tuners/loha/__init__.py deleted file mode 100644 index 4f7fe36fb..000000000 --- a/mindnlp/peft/tuners/loha/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""loha.""" - -from .config import LoHaConfig -from .layer import Conv2d, Linear, LoHaLayer -from .model import LoHaModel - -__all__ = ["LoHaConfig", "LoHaModel", "Conv2d", "Linear", "LoHaLayer"] diff --git a/mindnlp/peft/tuners/loha/config.py b/mindnlp/peft/tuners/loha/config.py deleted file mode 100644 index f5e34ede0..000000000 --- a/mindnlp/peft/tuners/loha/config.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""loha config""" -from dataclasses import dataclass, field -from typing import List, Optional, Union - -from mindnlp.peft.tuners.lycoris_utils import LycorisConfig -from mindnlp.peft.utils import PeftType - - -@dataclass -class LoHaConfig(LycorisConfig): - """ - This is the configuration class to store the configuration of a [`LoHaModel`]. - - Args: - r (`int`): - LoHa rank. - alpha (`int`): - The alpha parameter for LoHa scaling. - rank_dropout (`float`): - The dropout probability for rank dimension during training. 
- module_dropout (`float`): - The dropout probability for disabling LoHa modules during training. - use_effective_conv2d (`bool`): - Use parameter effective decomposition for Conv2d with ksize > 1 ("Proposition 3" from FedPara paper). - target_modules (`Optional[Union[List[str], str]]`): - The names of the modules to apply the adapter to. If this is specified, only the modules with the specified - names will be replaced. When passing a string, a regex match will be performed. When passing a list of - strings, either an exact match will be performed or it is checked if the name of the module ends with any - of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D modules are chosen, - excluding the output layer. If this is not specified, modules will be chosen according to the model - architecture. If the architecture is not known, an error will be raised -- in this case, you should specify - the target modules manually. - init_weights (`bool`): - Whether to perform initialization of adapter weights. This defaults to `True`, passing `False` is - discouraged. - layers_to_transform (`Union[List[int], int]`): - The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices - that are specified in this list. If a single integer is passed, it will apply the transformations on the - layer at this index. - layers_pattern (`str`): - The layer pattern name, used only if `layers_to_transform` is different from `None`. - rank_pattern (`dict`): - The mapping from layer names or regexp expression to ranks which are different from the default rank - specified by `r`. - alpha_pattern (`dict`): - The mapping from layer names or regexp expression to alphas which are different from the default alpha - specified by `alpha`. - modules_to_save (`Optional[List[str]]`): - List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint. - """ - r: int = field(default=8, metadata={"help": "LoHa rank"}) - alpha: int = field(default=8, metadata={"help": "LoHa alpha"}) - rank_dropout: float = field( - default=0.0, - metadata={"help": "The dropout probability for rank dimension during training"}, - ) - module_dropout: float = field( - default=0.0, - metadata={ - "help": "The dropout probability for disabling LoHa modules during training" - }, - ) - use_effective_conv2d: bool = field( - default=False, - metadata={ - "help": 'Use parameter effective decomposition for Conv2d 3x3 with ksize > 1 ("Proposition 3" from FedPara paper)' - }, - ) - target_modules: Optional[Union[List[str], str]] = field( - default=None, - metadata={ - "help": "List of cell names or regex expression of the module names to replace with LoHa." - "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " - "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer." - }, - ) - init_weights: bool = field( - default=True, - metadata={ - "help": ( - "Whether to initialize the weights of the LoHa layers with their default initialization. Don't change " - "this setting, except if you know exactly what you're doing." - ), - }, - ) - layers_to_transform: Optional[Union[List[int], int]] = field( - default=None, - metadata={ - "help": "The layer indexes to transform, is this argument is specified," - "PEFT will transform only the layers indexes that are specified inside this list. " - "If a single integer is passed, PEFT will transform only the layer at this index." 
- }, - ) - layers_pattern: Optional[str] = field( - default=None, - metadata={ - "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern." - }, - ) - modules_to_save: Optional[List[str]] = field( - default=None, - metadata={ - "help": "List of modules apart from LoHA layers to be set as trainable and saved in the final checkpoint. " - "For example, in Sequence Classification or Token Classification tasks, " - "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." - }, - ) - - def __post_init__(self): - r""" - This method initializes the attributes of a LoHaConfig object after its creation. - - Args: - self: The instance of the LoHaConfig class. - Type: LoHaConfig - Purpose: Represents the current instance of the LoHaConfig class. - Restrictions: This parameter is required and should always be the first parameter of the method. - - Returns: - None: This method does not return any value. - Type: None - Purpose: The method sets the 'peft_type' attribute to PeftType.LOHA and initializes the 'target_modules' attribute - as a set if the 'target_modules' attribute is of type list; otherwise, it leaves the 'target_modules' attribute unchanged. - - Raises: - None - """ - self.peft_type = PeftType.LOHA - self.target_modules = ( - set(self.target_modules) - if isinstance(self.target_modules, list) - else self.target_modules - ) diff --git a/mindnlp/peft/tuners/loha/layer.py b/mindnlp/peft/tuners/loha/layer.py deleted file mode 100644 index ef527b312..000000000 --- a/mindnlp/peft/tuners/loha/layer.py +++ /dev/null @@ -1,756 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""loha layer""" -import math -from typing import Any, Set, Tuple - -import mindspore -from mindnlp.core import nn, ops -from mindnlp.core.nn import ParameterDict, Parameter - -from mindnlp.peft.tuners.lycoris_utils import LycorisLayer - - -class LoHaLayer(nn.Module, LycorisLayer): - - r""" - The LoHaLayer class represents a layer that applies Local Harmonic Adaptation (LoHa) to a base layer. LoHaLayer inherits from nn.Module and LycorisLayer. It provides methods to create, reset, and update -adapter parameters, as well as to calculate delta weights and apply the adaptation to input data. - - Attributes: - base_layer (nn.Module): The base layer for which LoHa adaptation is applied. - - Methods: - - create_adapter_parameters(adapter_name, r, shape): Creates adapter parameters for the specified adapter name, rank, and shape. - - reset_adapter_parameters(adapter_name): Resets adapter parameters for the specified adapter name with initialized weights. - - reset_adapter_parameters_random(adapter_name): Resets adapter parameters for the specified adapter name with random weights. 
- - update_layer(adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, use_effective_conv2d, **kwargs): Updates the layer with a new adapter using the specified parameters. - - get_delta_weight(adapter_name): Retrieves the delta weight for the specified adapter name. - - forward(x, *args, **kwargs): Constructs the layer by applying the base layer and active adapters to the input data. - - The class also provides internal functions for managing adapter parameters, updating the layer with new adapters, and applying the adaptation to the input data. - - Note: Detailed parameter descriptions for each method are available in the method signatures in the source code. - """ - # All names of layers that may contain adapter weights - adapter_layer_names = ( - "hada_w1_a", - "hada_w1_b", - "hada_w2_a", - "hada_w2_b", - "hada_t1", - "hada_t2", - ) - # other_param_names is defined on parent class - - def __init__(self, base_layer: nn.Module): - r""" - Initializes the LoHaLayer class. - - Args: - self: The instance of the class. - base_layer (nn.Module): The base layer to be initialized with. It should be an instance of nn.Module. - - Returns: - None. This method does not return any value. - - Raises: - N/A - """ - super().__init__() - LycorisLayer.__init__(self, base_layer) - - # LoHa info - self.hada_w1_a = ParameterDict({}) - self.hada_w1_b = ParameterDict({}) - self.hada_w2_a = ParameterDict({}) - self.hada_w2_b = ParameterDict({}) - self.hada_t1 = ParameterDict({}) - self.hada_t2 = ParameterDict({}) - - @property - def _available_adapters(self) -> Set[str]: - """ - Method to retrieve the set of available adapters in the LoHaLayer class. - - Args: - self: Instance of the LoHaLayer class. - - Returns: - Returns a set of strings representing the available adapters in the LoHaLayer instance. - - Raises: - No specific exceptions are raised by this method. - """ - return { - *self.hada_w1_a, - *self.hada_w1_b, - *self.hada_w2_a, - *self.hada_w2_b, - *self.hada_t1, - *self.hada_t2, - } - - def create_adapter_parameters( - self, adapter_name: str, r: int, shape: Tuple[int, ...] - ): - r""" - This method creates adapter parameters for the LoHaLayer class. - - Args: - self: An instance of the LoHaLayer class. - adapter_name (str): The name of the adapter. - r (int): The value of 'r' parameter. - shape (Tuple[int, ...]): The shape of the parameters. - - Returns: - None: This method does not return any value. - - Raises: - None: This method does not raise any exceptions. 
- ''' - - The method's code is: - def create_adapter_parameters(self, adapter_name: str, r: int, shape: Tuple[int, ...]): - if len(shape) == 4: - self.hada_t1[adapter_name] = Parameter(ops.zeros(r, r, shape[2], shape[3])) - self.hada_w1_a[adapter_name] = Parameter(ops.zeros(r, shape[0])) - self.hada_w1_b[adapter_name] = Parameter(ops.zeros(r, shape[1])) - self.hada_t2[adapter_name] = Parameter(ops.zeros(r, r, shape[2], shape[3])) - self.hada_w2_a[adapter_name] = Parameter(ops.zeros(r, shape[0])) - self.hada_w2_b[adapter_name] = Parameter(ops.zeros(r, shape[1])) - else: - self.hada_w1_a[adapter_name] = Parameter(ops.zeros(shape[0], r)) - self.hada_w1_b[adapter_name] = Parameter(ops.zeros(r, shape[1])) - self.hada_w2_a[adapter_name] = Parameter(ops.zeros(shape[0], r)) - self.hada_w2_b[adapter_name] = Parameter(ops.zeros(r, shape[1])) - """ - # https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L130C9-L143C75 - if len(shape) == 4: - self.hada_t1[adapter_name] = Parameter( - ops.zeros(r, r, shape[2], shape[3]) - ) - self.hada_w1_a[adapter_name] = Parameter( - ops.zeros(r, shape[0]) - ) # out_dim, 1-mode - self.hada_w1_b[adapter_name] = Parameter( - ops.zeros(r, shape[1]) - ) # in_dim , 2-mode - - self.hada_t2[adapter_name] = Parameter( - ops.zeros(r, r, shape[2], shape[3]) - ) - self.hada_w2_a[adapter_name] = Parameter( - ops.zeros(r, shape[0]) - ) # out_dim, 1-mode - - self.hada_w2_b[adapter_name] = Parameter( - ops.zeros(r, shape[1]) - ) # in_dim , 2-mode - - else: - self.hada_w1_a[adapter_name] = Parameter(ops.zeros(shape[0], r)) - self.hada_w1_b[adapter_name] = Parameter(ops.zeros(r, shape[1])) - - self.hada_w2_a[adapter_name] = Parameter(ops.zeros(shape[0], r)) - self.hada_w2_b[adapter_name] = Parameter(ops.zeros(r, shape[1])) - - def reset_adapter_parameters(self, adapter_name: str): - # Original implementation performs initialization with normal distribution - # https://github.com/KohakuBlueleaf/LyCORIS/blob/3549fdef8f564761d68b695a08ef88b1122fdedc/lycoris/modules/loha.py#L158 - - # FedPara paper proposes to perform He initialization, let's stick with it - # It is enough to initialize only single matrix with zeros to make adapter do nothing after initialization - if adapter_name in self.hada_w1_a.keys(): - nn.init.kaiming_uniform_(self.hada_w1_a[adapter_name], a=math.sqrt(5)) - nn.init.kaiming_uniform_(self.hada_w1_b[adapter_name], a=math.sqrt(5)) - nn.init.kaiming_uniform_(self.hada_w2_a[adapter_name], a=math.sqrt(5)) - nn.init.zeros_(self.hada_w2_b[adapter_name]) - if adapter_name in self.hada_t1.keys(): - nn.init.kaiming_uniform_(self.hada_t1[adapter_name], a=math.sqrt(5)) - nn.init.kaiming_uniform_(self.hada_t2[adapter_name], a=math.sqrt(5)) - - def reset_adapter_parameters_random(self, adapter_name: str): - # Original implementation performs initialization with normal distribution - # https://github.com/KohakuBlueleaf/LyCORIS/blob/3549fdef8f564761d68b695a08ef88b1122fdedc/lycoris/modules/loha.py#L158 - - # FedPara paper proposes to perform He initialization, let's stick with it - # It is enough to initialize only single matrix with zeros to make adapter do nothing after initialization - if adapter_name in self.hada_w1_a.keys(): - nn.init.kaiming_uniform_(self.hada_w1_a[adapter_name], a=math.sqrt(5)) - nn.init.kaiming_uniform_(self.hada_w1_b[adapter_name], a=math.sqrt(5)) - nn.init.kaiming_uniform_(self.hada_w2_a[adapter_name], a=math.sqrt(5)) - nn.init.kaiming_uniform_(self.hada_w2_b[adapter_name], a=math.sqrt(5)) - if 
adapter_name in self.hada_t1.keys(): - nn.init.kaiming_uniform_(self.hada_t1[adapter_name], a=math.sqrt(5)) - nn.init.kaiming_uniform_(self.hada_t2[adapter_name], a=math.sqrt(5)) - - def update_layer( - self, - adapter_name: str, - r: int, - alpha: float, - rank_dropout: float, - module_dropout: float, - init_weights: bool, - use_effective_conv2d: bool = False, - **kwargs, - ) -> None: - """Internal function to create loha adapter - - Args: - adapter_name (`str`): Name for the adapter to add. - r (`int`): Rank for the added adapter. - alpha (`float`): Alpha for the added adapter. - rank_dropout (`float`): The dropout probability for rank dimension during training. - module_dropout (`float`): The dropout probability for disabling adapter during training. - init_weights (`bool`): Whether to initialize weights. - use_effective_conv2d (`bool`, *optional*, defaults to `False`): - Use parameter effective decomposition for Conv2d with ksize > 1. - """ - if r <= 0: - raise ValueError( - f"`r` should be a positive integer value but the value passed is {r}" - ) - - self.r[adapter_name] = r - self.alpha[adapter_name] = alpha - self.scaling[adapter_name] = alpha / r - self.rank_dropout[adapter_name] = rank_dropout - self.module_dropout[adapter_name] = module_dropout - - # Determine shape of LoHa weights - base_layer = self.get_base_layer() - if isinstance(base_layer, nn.Linear): - shape = tuple(base_layer.weight.shape) - elif isinstance(base_layer, nn.Conv2d): - use_effective_conv2d = use_effective_conv2d and base_layer.kernel_size != ( - 1, - 1, - ) - if use_effective_conv2d: - shape = ( - base_layer.out_channels, - base_layer.in_channels, - *base_layer.kernel_size, - ) - else: - shape = ( - base_layer.out_channels, - base_layer.in_channels - * base_layer.kernel_size[0] - * base_layer.kernel_size[1], - ) - else: - raise TypeError( - f"LoHa is not implemented for base layers of type {type(base_layer).__name__}" - ) - - # Create weights with provided shape - self.create_adapter_parameters(adapter_name, r, shape) - - # Initialize weights - if init_weights: - self.reset_adapter_parameters(adapter_name) - else: - self.reset_adapter_parameters_random(adapter_name) - # TODO - self.set_adapter(self.active_adapters) - - def get_delta_weight(self, adapter_name: str) -> mindspore.Tensor: - r""" - This method calculates the delta weight for a given adapter. - - Args: - self: The LoHaLayer object. - adapter_name (str): The name of the adapter for which to calculate the delta weight. - - Returns: - mindspore.Tensor: The delta weight tensor calculated for the specified adapter. - - Raises: - None. 
- - """ - # https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L178 - if adapter_name in self.hada_t1.keys(): - weight = make_weight_cp( - self.hada_t1[adapter_name], - self.hada_w1_a[adapter_name], - self.hada_w1_b[adapter_name], - self.hada_t2[adapter_name], - self.hada_w2_a[adapter_name], - self.hada_w2_b[adapter_name], - scale=mindspore.tensor(self.scaling[adapter_name]), - ) - else: - weight = make_weight( - self.hada_w1_a[adapter_name], - self.hada_w1_b[adapter_name], - self.hada_w2_a[adapter_name], - self.hada_w2_b[adapter_name], - scale=mindspore.tensor(self.scaling[adapter_name]), - ) - - base_layer = self.get_base_layer() - weight = weight.reshape(base_layer.weight.shape) - - # Perform rank dropout during training - drop rows of addition weights - rank_dropout = self.rank_dropout[adapter_name] - if self.training and rank_dropout: - drop = (ops.rand(weight.size(0)) > rank_dropout).to(weight.dtype) - drop = drop.view(-1, *[1] * len(weight.shape[1:])) - # TODO: Investigate if there should be a scaler like in normal dropout during training - # Original implementation doesn't have it - # https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L193 - drop /= drop.mean() - weight *= drop - return weight - - def forward(self, x: mindspore.Tensor, *args, **kwargs) -> mindspore.Tensor: - r""" - This method forwards the output tensor by applying various operations based on the input tensor and layer configurations. - - Args: - self: An instance of the LoHaLayer class. - x (mindspore.Tensor): The input tensor on which the operations will be applied. - - Returns: - mindspore.Tensor: The output tensor after applying the specified operations. - - Raises: - None - """ - previous_dtype = x.dtype - - if self.disable_adapters: - if self.merged: - self.unmerge() - result = self.base_layer(x, *args, **kwargs) - elif self.merged: - result = self.base_layer(x, *args, **kwargs) - else: - result = self.base_layer(x, *args, **kwargs) - - # Execute all the adapters - for active_adapter in self.active_adapters: - if active_adapter not in self._available_adapters: - continue - - module_dropout = self.module_dropout[active_adapter] - - # Modify current execution weights - if (not self.training) or ( - self.training and ops.rand(1) > module_dropout - ): - result = result + self._get_delta_activations( - active_adapter, x, *args, **kwargs - ) - - result = result.to(previous_dtype) - return result - - -class Linear(LoHaLayer): - """LoHa implemented in Linear layer""" - def __init__( - self, - base_layer: nn.Module, - adapter_name: str = "default", - r: int = 0, - alpha: float = 0.0, - rank_dropout: float = 0.0, - module_dropout: float = 0.0, - init_weights: bool = True, - **kwargs, - ): - r""" - __init__ - - Initializes the Linear class. - - Args: - self: The instance of the class itself. - base_layer (nn.Module): The base layer for the linear adapter. - adapter_name (str, optional): The name of the adapter. Defaults to 'default'. - r (int, optional): The value for r. Defaults to 0. - alpha (float, optional): The value for alpha. Defaults to 0.0. - rank_dropout (float, optional): The value for rank dropout. Defaults to 0.0. - module_dropout (float, optional): The value for module dropout. Defaults to 0.0. - init_weights (bool, optional): If True, initializes the weights. Defaults to True. - **kwargs: Additional keyword arguments. - - Returns: - None: This method does not return any value. 
- - Raises: - None - """ - super().__init__(base_layer) - - # Create adapter and set it active - self._active_adapter = adapter_name - self.update_layer( - adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, **kwargs - ) - - def _get_delta_activations( - self, adapter_name: str, input: mindspore.Tensor, *args: Any, **kwargs: Any - ) -> mindspore.Tensor: - r""" - Get the delta activations for the specified adapter and input. - - Args: - self (Linear): The Linear instance. - adapter_name (str): The name of the adapter. - input (mindspore.Tensor): The input tensor. - - Returns: - mindspore.Tensor: The delta activations tensor. - - Raises: - ValueError: If the adapter name is not valid. - TypeError: If the input tensor is not of type mindspore.Tensor. - """ - delta_weight = self.get_delta_weight(adapter_name) - # don't add bias here, because the bias is already included in the output of the base_layer - return ops.dense(input, delta_weight) - - def __repr__(self) -> str: - r""" - Docstring for __repr__ method in the class Linear. - - Args: - self: Linear object. Represents the instance of the Linear class. - - Returns: - str: A string representation of the Linear object with the prefix 'loha.' added to the default representation obtained using super(). - - Raises: - No specific exceptions are raised within this method. - """ - rep = super().__repr__() - return "loha." + rep - - -class Conv2d(LoHaLayer): - """LoHa implemented in Conv2d layer""" - def __init__( - self, - base_layer: nn.Module, - adapter_name: str = "default", - r: int = 0, - alpha: float = 0.0, - rank_dropout: float = 0.0, - module_dropout: float = 0.0, - use_effective_conv2d: bool = False, - init_weights: bool = True, - **kwargs, - ): - r""" - Initializes an instance of the Conv2d class. - - Args: - self: The instance of the class. - base_layer (nn.Module): The base layer used for convolutional operations. - adapter_name (str, optional): The name of the adapter. Defaults to 'default'. - r (int, optional): The value of r. Defaults to 0. - alpha (float, optional): The value of alpha. Defaults to 0.0. - rank_dropout (float, optional): The value of rank dropout. Defaults to 0.0. - module_dropout (float, optional): The value of module dropout. Defaults to 0.0. - use_effective_conv2d (bool, optional): Boolean flag indicating whether to use effective conv2d. Defaults to False. - init_weights (bool, optional): Boolean flag indicating whether to initialize the weights. Defaults to True. - **kwargs: Additional keyword arguments. - - Returns: - None: This method does not return any value. - - Raises: - None: This method does not raise any exceptions. - """ - super().__init__(base_layer) - - # Create adapter and set it active - self._active_adapter = adapter_name - self.update_layer( - adapter_name, - r, - alpha, - rank_dropout, - module_dropout, - init_weights, - use_effective_conv2d, - **kwargs, - ) - - def _get_delta_activations( - self, adapter_name: str, input: mindspore.Tensor, *args: Any, **kwargs: Any - ) -> mindspore.Tensor: - r""" - This method calculates the delta activations for the given input tensor using the specified adapter_name. - - Args: - self (Conv2d): The instance of the Conv2d class. - adapter_name (str): The name of the adapter used to obtain the delta weight. - input (mindspore.Tensor): The input tensor for which the delta activations need to be calculated. - - Returns: - mindspore.Tensor: A tensor containing the delta activations calculated based on the input and delta weight. 
- - Raises: - - ValueError: If the adapter_name is invalid or not found. - - RuntimeError: If there is an issue in obtaining the delta weight or base layer. - - TypeError: If the input tensor is not of type mindspore.Tensor. - """ - delta_weight = self.get_delta_weight(adapter_name) - # don't add bias here, because the bias is already included in the output of the base_layer - base_layer = self.get_base_layer() - return ops.conv2d( - input, - delta_weight, - stride=base_layer.stride, - padding=base_layer.padding, - dilation=base_layer.dilation, - groups=base_layer.groups, - ) - - def __repr__(self) -> str: - r""" - This method returns a string representation of the object. - - Args: - self: The instance of the Conv2d class. - - Returns: - str: A string representation of the object with the prefix 'loha.'. - - Raises: - No specific exceptions are raised by this method. - """ - rep = super().__repr__() - return "loha." + rep - - -# Below code is a direct copy from https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L9 - - -class HadaWeight(nn.Module): - - r""" - The HadaWeight class represents a module that calculates the Hadamard product of two sets of weights in a neural network. - - This class inherits from nn.Module and provides methods for forwarding the Hadamard product of two weight matrices, backpropagating through the operation, and computing the gradients with respect to the -input weights. - - The forward method computes the Hadamard product of two sets of weights scaled by a specified factor. - - The bprop method calculates the gradients of the input weights with respect to the output gradients, considering the Hadamard product operation and the scaling factor. - - Usage: - hw = HadaWeight() - diff_weight = hw.forward(w1a, w1b, w2a, w2b, scale) - grad_w1a, grad_w1b, grad_w2a, grad_w2b, scale = hw.bprop(w1a, w1b, w2a, w2b, scale, out, dout) - """ - def forward(self, w1a, w1b, w2a, w2b, scale=mindspore.tensor(1)): - r""" - Constructs the Hadamard weight for the given inputs. - - Args: - self (HadaWeight): An instance of the HadaWeight class. - w1a (tensor): The first weight matrix (w1a) of shape (m, n). - w1b (tensor): The second weight matrix (w1b) of shape (n, p). - w2a (tensor): The third weight matrix (w2a) of shape (p, q). - w2b (tensor): The fourth weight matrix (w2b) of shape (q, r). - scale (tensor, optional): The scale factor to be applied to the result. Defaults to mindspore.tensor(1). - - Returns: - tensor: The resulting Hadamard weight matrix of shape (m, r). - - Raises: - TypeError: If any of the input parameters (w1a, w1b, w2a, w2b, scale) are not of type 'tensor'. - ValueError: If the shapes of the weight matrices (w1a, w1b, w2a, w2b) are not compatible. - """ - diff_weight = ((w1a @ w1b) * (w2a @ w2b)) * scale - return diff_weight - - def bprop(self, w1a, w1b, w2a, w2b, scale, out, dout): - r""" - This method, bprop, is a part of the HadaWeight class and is used for backpropagation calculations. It takes in the following parameters: - - Args: - - self: Represents the instance of the HadaWeight class. - - w1a: A numpy array representing weights for the first layer's input units. - - w1b: A numpy array representing weights for the first layer's output units. - - w2a: A numpy array representing weights for the second layer's input units. - - w2b: A numpy array representing weights for the second layer's output units. - - scale: A scalar value used for scaling the gradient. 
- - out: A numpy array representing the output of the forward pass. - - dout: A numpy array representing the gradient of the output layer. - - Returns: - - grad_w1a: A numpy array representing the gradient of the weights w1a. - - grad_w1b: A numpy array representing the gradient of the weights w1b. - - grad_w2a: A numpy array representing the gradient of the weights w2a. - - grad_w2b: A numpy array representing the gradient of the weights w2b. - - scale: A scalar value representing the updated scale. - - Raises: - - None - - """ - dout = dout * scale - temp = dout * (w2a @ w2b) - grad_w1a = temp @ w1b.T - grad_w1b = w1a.T @ temp - - temp = dout * (w1a @ w1b) - grad_w2a = temp @ w2b.T - grad_w2b = w2a.T @ temp - - return grad_w1a, grad_w1b, grad_w2a, grad_w2b, scale - - -class HadaWeightCP(nn.Module): - - r""" - The HadaWeightCP class represents a cell for performing HadaWeightCP (Hadamard product with weight and channel permutation) operations. This class inherits from nn.Module and provides methods for -forwarding the HadaWeightCP operation and its backward propagation. - - The forward method takes input tensors t1, w1a, w1b, t2, w2a, w2b, and optional scale, and returns the result of the HadaWeightCP operation. The HadaWeightCP operation involves performing einsum -operations on the input tensors and scaling the result by the provided scale. - - The bprop method takes input tensors t1, w1a, w1b, t2, w2a, w2b, scale, out, and dout, and computes the gradients with respect to the input tensors and weight matrices. The method involves performing -einsum operations and computing gradients for w1a, w1b, t1, w2a, w2b, and t2. - - This class is designed to be used as a building block for neural network models that involve HadaWeightCP operations and provides an efficient and optimized implementation for such operations. - """ - def forward(self, t1, w1a, w1b, t2, w2a, w2b, scale=mindspore.tensor(1)): - r""" - Constructs a weighted tensor product using the HadaWeightCP method. - - Args: - self: An instance of the HadaWeightCP class. - t1: A tensor of shape (i, j, k, l), representing the first input tensor. - w1a: A tensor of shape (j, r), representing the first weight tensor (a). - w1b: A tensor of shape (i, p), representing the first weight tensor (b). - t2: A tensor of shape (i, j, k, l), representing the second input tensor. - w2a: A tensor of shape (j, r), representing the second weight tensor (a). - w2b: A tensor of shape (i, p), representing the second weight tensor (b). - scale: A tensor of shape (), representing the scaling factor applied to the product (default: 1). - - Returns: - None. This method does not return any value. - - Raises: - None. - """ - rebuild1 = ops.einsum("i j k l, j r, i p -> p r k l", t1, w1b, w1a) - rebuild2 = ops.einsum("i j k l, j r, i p -> p r k l", t2, w2b, w2a) - return rebuild1 * rebuild2 * scale - - def bprop(self, t1, w1a, w1b, t2, w2a, w2b, scale, out, dout): - r""" - This method calculates the backward propagation for the HadaWeightCP class. - - Args: - self (HadaWeightCP): An instance of the HadaWeightCP class. - t1 (numpy.ndarray): Input tensor 1. - w1a (numpy.ndarray): Weight tensor 1a. - w1b (numpy.ndarray): Weight tensor 1b. - t2 (numpy.ndarray): Input tensor 2. - w2a (numpy.ndarray): Weight tensor 2a. - w2b (numpy.ndarray): Weight tensor 2b. - scale (float): Scaling factor. - out (numpy.ndarray): Output tensor. - dout (numpy.ndarray): Gradient of the output tensor. 
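Stepping back from the docstrings: HadaWeight composes the LoHa update as the element-wise (Hadamard) product of two independent rank-r factorizations, which is why the effective rank of the update can reach r * r rather than r. A small NumPy sketch of the forward rule, with arbitrary illustrative sizes:

```py
import numpy as np

rng = np.random.default_rng(0)
out_dim, in_dim, r = 8, 16, 2
scale = 0.5

# Two independent low-rank factor pairs targeting the same weight shape.
w1a, w1b = rng.normal(size=(out_dim, r)), rng.normal(size=(r, in_dim))
w2a, w2b = rng.normal(size=(out_dim, r)), rng.normal(size=(r, in_dim))

# HadaWeight.forward: Hadamard product of the two rank-r products, then scaled.
diff_weight = (w1a @ w1b) * (w2a @ w2b) * scale
assert diff_weight.shape == (out_dim, in_dim)

# Each factor product has rank <= r, but their element-wise product can have
# rank up to r * r, which is the point of the LoHa parametrization.
print(np.linalg.matrix_rank(w1a @ w1b), np.linalg.matrix_rank(diff_weight))
```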
- - Returns: - Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray, None]: - - grad_t1 (numpy.ndarray): Gradient for input tensor 1. - - grad_w1a (numpy.ndarray): Gradient for weight tensor 1a. - - grad_w1b (numpy.ndarray): Gradient for weight tensor 1b. - - grad_t2 (numpy.ndarray): Gradient for input tensor 2. - - grad_w2a (numpy.ndarray): Gradient for weight tensor 2a. - - grad_w2b (numpy.ndarray): Gradient for weight tensor 2b. - - None: This method does not return any value. - - Raises: - None: This method does not raise any exceptions. - """ - dout = dout * scale - - temp = ops.einsum("i j k l, j r -> i r k l", t2, w2b) - rebuild = ops.einsum("i j k l, i r -> r j k l", temp, None) - - grad_w = rebuild * dout - - grad_w1a = ops.einsum("r j k l, i j k l -> r i", temp, grad_w) - grad_temp = ops.einsum("i j k l, i r -> r j k l", grad_w, w1a.T) - - grad_w1b = ops.einsum("i r k l, i j k l -> r j", t1, grad_temp) - grad_t1 = ops.einsum("i j k l, j r -> i r k l", grad_temp, w1b.T) - - temp = ops.einsum("i j k l, j r -> i r k l", t1, w1b) - rebuild = ops.einsum("i j k l, i r -> r j k l", temp, w1a) - - grad_w = rebuild * dout - - grad_w2a = ops.einsum("r j k l, i j k l -> r i", temp, grad_w) - grad_temp = ops.einsum("i j k l, i r -> r j k l", grad_w, w2a.T) - - grad_w2b = ops.einsum("i r k l, i j k l -> r j", t2, grad_temp) - grad_t2 = ops.einsum("i j k l, j r -> i r k l", grad_temp, w2b.T) - return (grad_t1, grad_w1a, grad_w1b, grad_t2, grad_w2a, grad_w2b, None) - - -def make_weight(w1a, w1b, w2a, w2b, scale): - """ - Args: - w1a (float): The weight value for the first item in the first set. - w1b (float): The weight value for the second item in the first set. - w2a (float): The weight value for the first item in the second set. - w2b (float): The weight value for the second item in the second set. - scale (float): The scale factor for the weights. - - Returns: - None: This function does not return any value. - - Raises: - None: This function does not explicitly raise any exceptions. - """ - hadaweight = HadaWeight() - return hadaweight(w1a, w1b, w2a, w2b, scale) - - -def make_weight_cp(t1, w1a, w1b, t2, w2a, w2b, scale): - r""" - This function takes in seven parameters: t1, w1a, w1b, t2, w2a, w2b, and scale. - - Args: - t1 (type): The first parameter representing some value. - w1a (type): The second parameter representing a weight value. - w1b (type): The third parameter representing another weight value. - t2 (type): The fourth parameter representing a different value. - w2a (type): The fifth parameter representing a weight value. - w2b (type): The sixth parameter representing another weight value. - scale (type): The seventh parameter representing a scaling factor. - - Returns: - None: This function does not return any value. - - Raises: - None: This function does not raise any exceptions. - """ - hadaweightcp = HadaWeightCP() - return hadaweightcp(t1, w1a, w1b, t2, w2a, w2b, scale) diff --git a/mindnlp/peft/tuners/loha/model.py b/mindnlp/peft/tuners/loha/model.py deleted file mode 100644 index d08a78fb7..000000000 --- a/mindnlp/peft/tuners/loha/model.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""loha model""" -import re -from itertools import chain -from typing import Dict, Type, Union - -from mindnlp.core import nn - -from mindnlp.peft.tuners.lycoris_utils import LycorisConfig, LycorisTuner - -from .layer import Conv2d, Linear, LoHaLayer - - -class LoHaModel(LycorisTuner): - """ - Creates Low-Rank Hadamard Product model from a pretrained model. The method is partially described in - https://arxiv.org/abs/2108.06098 Current implementation heavily borrows from - https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py - - Args: - model (`torch.nn.Module`): The model to which the adapter tuner layers will be attached. - config ([`LoHaConfig`]): The configuration of the LoHa model. - adapter_name (`str`): The name of the adapter, defaults to `"default"`. - - Returns: - `torch.nn.Module`: The LoHa model. - - Example: - ```py - >>> from diffusers import StableDiffusionPipeline - >>> from peft import LoHaModel, LoHaConfig - - >>> config_te = LoHaConfig( - ... r=8, - ... lora_alpha=32, - ... target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], - ... rank_dropout=0.0, - ... module_dropout=0.0, - ... init_weights=True, - ... ) - >>> config_unet = LoHaConfig( - ... r=8, - ... lora_alpha=32, - ... target_modules=[ - ... "proj_in", - ... "proj_out", - ... "to_k", - ... "to_q", - ... "to_v", - ... "to_out.0", - ... "ff.net.0.proj", - ... "ff.net.2", - ... ], - ... rank_dropout=0.0, - ... module_dropout=0.0, - ... init_weights=True, - ... use_effective_conv2d=True, - ... ) - - >>> model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - >>> model.text_encoder = LoHaModel(model.text_encoder, config_te, "default") - >>> model.unet = LoHaModel(model.unet, config_unet, "default") - ``` - - **Attributes**: - - **model** ([`~torch.nn.Module`]) -- The model to be adapted. - - **peft_config** ([`LoHaConfig`]): The configuration of the LoHa model. - """ - prefix: str = "hada_" - layers_mapping: Dict[Type[nn.Module], Type[LoHaLayer]] = { - nn.Conv2d: Conv2d, - nn.Linear: Linear, - } - - def _create_and_replace( - self, - config: LycorisConfig, - adapter_name: str, - target: Union[LoHaLayer, nn.Module], - target_name: str, - parent: nn.Module, - current_key: str, - ) -> None: - """ - A private method to create and replace the target module with the adapter module. 
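The replacement step described here resolves per-module overrides by regex-matching the keys of `rank_pattern` and `alpha_pattern` against the module's fully qualified name, falling back to the defaults when nothing matches; the LoKr model later in this diff reuses the same rule. A stand-alone sketch of that resolution; the module paths and override values are invented for illustration:

```py
import re
from itertools import chain

# Invented per-layer overrides; keys are matched as rf"(.*\.)?{key}$" against
# the fully qualified module name.
rank_pattern = {"q_proj": 4}
alpha_pattern = {"out_proj": 32}
default_r, default_alpha = 8, 8

def resolve(current_key, target_name):
    pattern_keys = list(chain(rank_pattern.keys(), alpha_pattern.keys()))
    target_name_key = next(
        filter(lambda key: re.match(rf"(.*\.)?{key}$", current_key), pattern_keys),
        target_name,
    )
    return (
        rank_pattern.get(target_name_key, default_r),
        alpha_pattern.get(target_name_key, default_alpha),
    )

print(resolve("model.layers.3.self_attn.q_proj", "q_proj"))      # (4, 8)
print(resolve("model.layers.3.self_attn.out_proj", "out_proj"))  # (8, 32)
print(resolve("model.layers.3.mlp.fc1", "fc1"))                  # (8, 8)
```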
- """ - # Regexp matching - Find key which matches current target_name in patterns provided - pattern_keys = list( - chain(config.rank_pattern.keys(), config.alpha_pattern.keys()) - ) - target_name_key = next( - filter(lambda key: re.match(rf"(.*\.)?{key}$", current_key), pattern_keys), - target_name, - ) - - kwargs = config.to_dict() - kwargs["r"] = config.rank_pattern.get(target_name_key, config.r) - kwargs["alpha"] = config.alpha_pattern.get(target_name_key, config.alpha) - if isinstance(target, LoHaLayer): - target.update_layer(adapter_name, **kwargs) - else: - new_module = self._create_new_module(config, adapter_name, target, **kwargs) - self._replace_module(parent, target_name, new_module, target) diff --git a/mindnlp/peft/tuners/lokr/__init__.py b/mindnlp/peft/tuners/lokr/__init__.py deleted file mode 100644 index 1d1e73ae3..000000000 --- a/mindnlp/peft/tuners/lokr/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""lokr.""" - -from .config import LoKrConfig -from .layer import Conv2d, Dense, LoKrLayer -from .model import LoKrModel - - -__all__ = ["LoKrConfig", "LoKrModel", "Conv2d", "Dense", "LoKrLayer"] diff --git a/mindnlp/peft/tuners/lokr/config.py b/mindnlp/peft/tuners/lokr/config.py deleted file mode 100644 index 2e66a83db..000000000 --- a/mindnlp/peft/tuners/lokr/config.py +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""lokr.""" -from dataclasses import dataclass, field -from typing import List, Optional, Union - -from ...config import PeftConfig -from ...utils import PeftType - - -@dataclass -class LoKrConfig(PeftConfig): - """ - This is the configuration class to store the configuration of a [`LoraModel`]. - - Args: - r (`int`): lokr attention dimension. - target_modules (`Union[List[str],str]`): The names of the modules to apply Lora to. - lora_alpha (`float`): The alpha parameter for Lokr scaling. - rank_dropout (`float`):The dropout probability for rank dimension during training. - module_dropout (`float`): The dropout probability for LoKR layers. - use_effective_conv2d (`bool`): - Use parameter effective decomposition for - Conv2d with ksize > 1 ("Proposition 3" from FedPara paper). - decompose_both (`bool`):Perform rank decomposition of left kronecker product matrix. 
- decompose_factor (`int`):Kronecker product decomposition factor. - - bias (`str`): Bias type for Lora. Can be 'none', 'all' or 'lora_only' - modules_to_save (`List[str]`): - List of modules apart from LoRA layers to be set as trainable - and saved in the final checkpoint. - init_weights (`bool`): - Whether to perform initialization of adapter weights. This defaults to `True`, - passing `False` is discouraged. - layers_to_transform (`Union[List[int],int]`): - The layer indexes to transform, if this argument is specified, it will apply the LoRA transformations on - the layer indexes that are specified in this list. If a single integer is passed, it will apply the LoRA - transformations on the layer at this index. - layers_pattern (`str`): - The layer pattern name, used only if `layers_to_transform` is different from `None` and if the layer - pattern is not in the common layers pattern. - rank_pattern (`dict`): - The mapping from layer names or regexp expression to ranks which are different from the default rank - specified by `r`. - alpha_pattern (`dict`): - The mapping from layer names or regexp expression to alphas which are different from the default alpha - specified by `alpha`. - """ - r: int = field(default=8, metadata={"help": "lokr attention dimension"}) - target_modules: Optional[Union[List[str], str]] = field( - default=None, - metadata={ - "help": "List of cell names or regex expression of the cell names to replace with Lora." - "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " - }, - ) - lora_alpha: int = field(default=8, metadata={"help": "lokr alpha"}) - rank_dropout: float = field( - default=0.0, - metadata={"help": "The dropout probability for rank dimension during training"}, - ) - module_dropout: float = field(default=0.0, metadata={"help": "lokr dropout"}) - use_effective_conv2d: bool = field( - default=False, - metadata={ - "help": 'Use parameter effective decomposition for Conv2d 3x3 with ksize > 1 ("Proposition 3" from FedPara paper)' - }, - ) - decompose_both: bool = field( - default=False, - metadata={ - "help": "Perform rank decomposition of left kronecker product matrix." - }, - ) - decompose_factor: int = field( - default=-1, metadata={"help": "Kronecker product decomposition factor."} - ) - - bias: str = field( - default="none", - metadata={"help": "Bias type for Lora. Can be 'none', 'all' or 'lora_only'"}, - ) - modules_to_save: Optional[List[str]] = field( - default=None, - metadata={ - "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. " - "For example, in Sequence Classification or Token Classification tasks, " - "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." - }, - ) - init_weights: bool = field( - default=True, - metadata={"help": "Whether to initialize the weights of the Lora layers."}, - ) - layers_to_transform: Optional[Union[List, int]] = field( - default=None, - metadata={ - "help": "The layer indexes to transform, is this argument is specified, \ - PEFT will transform only the layers indexes that are specified inside this list. \ - If a single integer is passed, PEFT will transform only the layer at this index." - }, - ) - layers_pattern: Optional[str] = field( - default=None, - metadata={ - "help": "The layer pattern name, used only if `layers_to_transform` is different to None and \ - if the layer pattern is not in the common layers pattern." 
- }, - ) - rank_pattern: Optional[dict] = field( - default_factory=dict, - metadata={ - "help": ( - "The mapping from layer names or regexp expression to ranks which are different from the default rank specified by `r`. " - "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 8`}" - ) - }, - ) - alpha_pattern: Optional[dict] = field( - default_factory=dict, - metadata={ - "help": ( - "The mapping from layer names or regexp expression to alphas which are different from the default alpha specified by `alpha`. " - "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 32`}" - ) - }, - ) - - def __post_init__(self): - r""" - Method to initialize the attributes of the LoKrConfig class after object creation. - - Args: - self: Instance of the LoKrConfig class. - - Returns: - None. This method performs attribute initialization within the class. - - Raises: - No specific exceptions are raised within this method. - """ - self.peft_type = PeftType.LOKR - - @property - def is_prompt_learning(self): - r""" - Utility method to check if the configuration is for prompt learning. - """ - return False diff --git a/mindnlp/peft/tuners/lokr/layer.py b/mindnlp/peft/tuners/lokr/layer.py deleted file mode 100644 index d7d9557e2..000000000 --- a/mindnlp/peft/tuners/lokr/layer.py +++ /dev/null @@ -1,1023 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Lokr.""" -import math -import warnings -from typing import List, Optional, Union, Any, Set, Tuple -from abc import abstractmethod - -import mindspore as ms -from mindspore.common.initializer import initializer, HeUniform, Zero - -from mindnlp.core import nn, ops -from mindnlp.core.nn import ParameterDict - -from ..tuners_utils import ( - BaseTunerLayer, - check_adapters_to_merge, -) - - -class LoKrLayer(nn.Module, BaseTunerLayer): - - r""" - LoKrLayer is a custom PyTorch class representing a layer that implements the Locally Kroneckerized Neural Network adaptation technique. This technique allows for adaptive modifications to be made on top of -the base layer's output. The class provides methods for creating, updating, merging, unmerging, and managing adaptive layers within the network. - - Attributes: - - lokr_w1: Dictionary of parameters for the first adaptive layer. - - lokr_w1_a: Dictionary of parameters for the first adaptive layer (alternative). - - lokr_w1_b: Dictionary of parameters for the first adaptive layer (alternative). - - lokr_w2: Dictionary of parameters for the second adaptive layer. - - lokr_w2_a: Dictionary of parameters for the second adaptive layer (alternative). - - lokr_w2_b: Dictionary of parameters for the second adaptive layer (alternative). - - lokr_t2: Dictionary of parameters for the second adaptive layer (tensor version). - - base_layer: The base layer on which the adaptive modifications are applied. - - r: Dictionary storing the rank values for each adapter. 
- - alpha: Dictionary storing alpha values for each adapter. - - scaling: Dictionary storing scaling values for each adapter. - - rank_dropout: Dictionary storing rank dropout probabilities for each adapter. - - module_dropout: Dictionary storing cell dropout probabilities for each adapter. - - _disable_adapters: Boolean flag indicating whether adapters are disabled. - - merged_adapters: List of names of merged adapters. - - Methods: - - _get_delta_activations: Abstract method to retrieve activations added on top of the base layer output. - - _available_adapters: Property returning a set of available adapter names. - - active_adapter: Property returning the name of the active adapter. - - disable_adapters: Property returning a boolean flag indicating whether adapters are disabled. - - merged: Property returning a boolean value indicating if any adapters are merged. - - active_adapters: Property returning a list of active adapter names. - - get_base_layer: Method to recursively retrieve the base layer. - - create_adapter_parameters: Method to create adapter parameters based on input configurations. - - reset_adapter_parameters: Method to reset adapter parameters to initial values. - - reset_adapter_parameters_random: Method to reset adapter parameters to random initial values. - - update_layer: Method to update the layer with a new adapter based on specified parameters. - - set_adapter: Method to set the active adapter(s) and mark them as trainable. - - merge: Method to merge active adapter weights into the base weights. - - unmerge: Method to unmerge all merged adapter layers from the base weights. - - get_delta_weight: Method to calculate the delta weight for a specific adapter. - - forward: Method to forward the output of the layer with adaptive modifications applied. - - Note: - This class is intended for advanced neural network adaptation techniques and should be used in conjunction with PyTorch's nn.Module functionalities. - """ - other_param_names = ("r", "alpha", "scaling", "rank_dropout", "module_dropout") - # All names of layers that may contain adapter weights - adapter_layer_names = ( - "lokr_w1", - "lokr_w1_a", - "lokr_w1_b", - "lokr_w2", - "lokr_w2_a", - "lokr_w2_b", - "lokr_t2", - ) - r""" - A tuner layer mixin that provides the common methods and attributes for all tuners. - - Args: - is_pluggable (`bool`, *optional*): - Whether the adapter layer can be plugged to any pytorch cell - active_adapters (Union[List[`str`], `str`], *optional*): - The name of the active adapter. - """ - - # indicates whether all adapters should be disabled - _disable_adapters: bool = False - - # the currently active adapter(s) - _active_adapter: Union[str, List[str]] - - # List all merged adapters - merged_adapters: "List[str]" = [] - - def __init__(self, base_layer: nn.Module) -> None: - r""" - This method initializes an instance of the LoKrLayer class. - - Args: - self: The instance of the LoKrLayer class. - base_layer (nn.Module): The base layer used for the LoKrLayer. - - Returns: - None: This method does not return any value. - - Raises: - None. 
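Tying the configuration fields documented earlier to a concrete call, a typical LoKr setup could look like the following; the import path reflects the package layout removed in this diff, and every module name and hyperparameter value is illustrative only:

```py
# Assumed import path per the pre-removal layout (mindnlp/peft/tuners/lokr/__init__.py).
from mindnlp.peft.tuners.lokr import LoKrConfig

config = LoKrConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # or a single regex string
    rank_dropout=0.0,
    module_dropout=0.0,
    decompose_both=False,
    decompose_factor=-1,           # -1: factorize near the square root of each dim
    use_effective_conv2d=True,
    rank_pattern={"q_proj": 4},    # per-module rank override
    alpha_pattern={"v_proj": 32},  # per-module alpha override
)
```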
- """ - super().__init__() - # LoKr info - self.lokr_w1 = ParameterDict({}) - self.lokr_w1_a = ParameterDict({}) - self.lokr_w1_b = ParameterDict({}) - self.lokr_w2 = ParameterDict({}) - self.lokr_w2_a = ParameterDict({}) - self.lokr_w2_b = ParameterDict({}) - self.lokr_t2 = ParameterDict({}) - self.base_layer = base_layer - self.r = {} - self.alpha = {} - self.scaling = {} - self.rank_dropout = {} - self.module_dropout = {} - - # Tuner info - self._disable_adapters = False - self.merged_adapters = [] - - @abstractmethod - def _get_delta_activations( - self, adapter_name: str, x: ms.Tensor, *args: Any, **kwargs: Any - ) -> ms.Tensor: - """Activations added on top of the base layer output (i.e. after the base layer forward pass)""" - @property - def _available_adapters(self) -> Set[str]: - r""" - Method to retrieve the set of available adapters. - - Args: - self (LoKrLayer): The instance of the LoKrLayer class. - This parameter represents the current instance of the LoKrLayer class. - - Returns: - Set[str]: A set containing strings representing available adapters. - The set includes available adapters from different sources within the LoKrLayer instance. - - Raises: - None - """ - return { - *self.lokr_w1, - *self.lokr_w1_a, - *self.lokr_w1_b, - *self.lokr_w2, - *self.lokr_w2_a, - *self.lokr_w2_b, - *self.lokr_t2, - } - - @property - def active_adapter(self) -> str: - r""" - This method returns the active adapter. - - Args: - self: Instance of the LoKrLayer class. - - Returns: - str: The active adapter as a string. - - Raises: - None - """ - # use a property to ensure that active_adapter is not set directly, instead use the set_adapter method - return self._active_adapter - - @property - def disable_adapters(self) -> bool: - r""" - Disables the adapters in the LoKrLayer. - - Args: - self: An instance of the LoKrLayer class. - - Returns: - bool: True if the adapters are successfully disabled, False otherwise. - - Raises: - None. - """ - # use a property to ensure that disable_adapters is not set directly, instead use the enable_adapters method - return self._disable_adapters - - @property - def merged(self) -> bool: - r""" - Returns a boolean value indicating whether the 'LoKrLayer' instance has any merged adapters. - - Args: - self: The current instance of 'LoKrLayer'. - - Returns: - bool: True if the 'LoKrLayer' instance has merged adapters, False otherwise. - - Raises: - None. - """ - return bool(self.merged_adapters) - - @property - def active_adapters(self): - r""" - This method 'active_adapters' in the class 'LoKrLayer' retrieves the active adapters. - - Args: - self: The instance of the 'LoKrLayer' class. - - Returns: - If the 'active_adapter' attribute of the instance is a string, this method returns a list containing that string. - If the 'active_adapter' attribute of the instance is not a string, the method returns the 'active_adapter' attribute itself. - - Raises: - No specific exceptions are raised by this method. - """ - if isinstance(self.active_adapter, str): - return [self.active_adapter] - # is already a list of str - return self.active_adapter - - def get_base_layer(self) -> nn.Module: - """ - (Recursively) get the base_layer. - - This is necessary for the case that the tuner layer wraps another tuner layer. 
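Since tuner layers may wrap other tuner layers, `get_base_layer` simply follows `base_layer` attributes until none is left. A minimal stand-alone sketch of that unwrapping, using placeholder classes rather than library types:

```py
class Wrapper:
    """Stand-in for a tuner layer that wraps another layer."""
    def __init__(self, base_layer):
        self.base_layer = base_layer

class Leaf:
    """Stand-in for the raw nn module at the bottom."""

def get_base_layer(layer):
    # Walk nested wrappers until a module without `base_layer` is reached.
    while hasattr(layer, "base_layer"):
        layer = layer.base_layer
    return layer

assert isinstance(get_base_layer(Wrapper(Wrapper(Leaf()))), Leaf)
```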
- - """ - base_layer = self - while hasattr(base_layer, "base_layer"): - base_layer = base_layer.base_layer - return base_layer - - def create_adapter_parameters( - self, - adapter_name: str, - r: int, - shape, - use_w1: bool, - use_w2: bool, - use_effective_conv2d: bool, - ): - r"""Create adapter parameters for the LoKrLayer class. - - This method creates and initializes adapter parameters based on the provided arguments. The adapter parameters are used for the LoKrLayer class. - - Args: - self (LoKrLayer): The instance of the LoKrLayer class. - adapter_name (str): The name of the adapter. - r (int): The value of 'r' used for parameter initialization. - shape: The shape of the parameters. It can be a tuple or a list of tuples, depending on the number of dimensions. - use_w1 (bool): A flag indicating whether to use the 'w1' parameter. - use_w2 (bool): A flag indicating whether to use the 'w2' parameter. - use_effective_conv2d (bool): A flag indicating whether to use the 'effective_conv2d' parameter. - - Returns: - None: This method does not return any value. - - Raises: - None: This method does not raise any exceptions. - """ - if use_w1: - self.lokr_w1[adapter_name] = ms.Parameter( - ops.zeros((shape[0][0], shape[1][0])) - ) - else: - self.lokr_w1_a[adapter_name] = ms.Parameter(ops.zeros((shape[0][0], r))) - self.lokr_w1_b[adapter_name] = ms.Parameter(ops.zeros((r, shape[1][0]))) - - if len(shape) == 4: - # Conv2d - if use_w2: - self.lokr_w2[adapter_name] = ms.Parameter( - ops.zeros((shape[0][1], shape[1][1], *shape[2:])) - ) - elif use_effective_conv2d: - self.lokr_t2[adapter_name] = ms.Parameter( - ops.zeros((r, r, shape[2], shape[3])) - ) - self.lokr_w2_a[adapter_name] = ms.Parameter( - ops.zeros((r, shape[0][1])) - ) # b, 1-mode - self.lokr_w2_b[adapter_name] = ms.Parameter( - ops.zeros((r, shape[1][1])) - ) # d, 2-mode - else: - self.lokr_w2_a[adapter_name] = ms.Parameter(ops.zeros((shape[0][1], r))) - self.lokr_w2_b[adapter_name] = ms.Parameter( - ops.zeros((r, shape[1][1] * shape[2] * shape[3])) - ) - else: - # Linear - if use_w2: - self.lokr_w2[adapter_name] = ms.Parameter( - ops.zeros((shape[0][1], shape[1][1])) - ) - else: - self.lokr_w2_a[adapter_name] = ms.Parameter(ops.zeros((shape[0][1], r))) - self.lokr_w2_b[adapter_name] = ms.Parameter(ops.zeros((r, shape[1][1]))) - - def reset_adapter_parameters(self, adapter_name: str): - r""" - Reset the parameters of the specified adapter within the LoKrLayer. - - Args: - self (LoKrLayer): The instance of the LoKrLayer class. - adapter_name (str): The name of the adapter whose parameters are to be reset. - - Returns: - None: This method does not return any value. - - Raises: - KeyError: If the specified adapter_name is not found within the LoKrLayer attributes. - ValueError: If the specified adapter_name is not found within the LoKrLayer attributes and no corresponding backup attributes exist. 
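The shapes created above are chosen so that, for a Linear base layer, the Kronecker product of `w1` and `w2` reconstructs a matrix of exactly the base weight's shape, and zero-initializing one factor makes the initial delta vanish. A NumPy sketch with made-up dimensions:

```py
import numpy as np

out_dim, in_dim = 12, 20
out_l, out_k = 3, 4   # out_dim = out_l * out_k
in_m, in_n = 4, 5     # in_dim  = in_m * in_n

# Full (non-decomposed) LoKr factors, matching create_adapter_parameters:
#   lokr_w1: (out_l, in_m), lokr_w2: (out_k, in_n)
w1 = np.zeros((out_l, in_m))
w2 = np.random.default_rng(0).normal(size=(out_k, in_n))

delta = np.kron(w1, w2)
assert delta.shape == (out_dim, in_dim)  # (out_l * out_k, in_m * in_n)

# With init_weights=True, w1 is zero-initialized, so the initial delta is zero
# and the adapted layer starts out identical to the base layer.
assert not delta.any()
```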
- """ - if adapter_name in self.lokr_w1: - # nn.init.zeros_(self.lokr_w1[adapter_name]) - self.lokr_w1[adapter_name].assign_value( - initializer( - Zero(), - self.lokr_w1[adapter_name].shape, - self.lokr_w1[adapter_name].dtype, - ) - ) - - else: - # nn.init.zeros_(self.lokr_w1_a[adapter_name]) - self.lokr_w1_a[adapter_name].assign_value( - initializer( - Zero(), - self.lokr_w1[adapter_name].shape, - self.lokr_w1[adapter_name].dtype, - ) - ) - # nn.init.kaiming_uniform_(self.lokr_w1_b[adapter_name], a=math.sqrt(5)) - self.lokr_w1_b[adapter_name].assign_value( - initializer( - HeUniform(negative_slope=math.sqrt(5)), - self.lokr_w1_b[adapter_name].shape, - self.lokr_w1_b[adapter_name].dtype, - ) - ) - if adapter_name in self.lokr_w2: - # nn.init.kaiming_uniform_(self.lokr_w2[adapter_name], a=math.sqrt(5)) - self.lokr_w2[adapter_name].assign_value( - initializer( - HeUniform(negative_slope=math.sqrt(5)), - self.lokr_w2[adapter_name].shape, - self.lokr_w2[adapter_name].dtype, - ) - ) - else: - # nn.init.kaiming_uniform_(self.lokr_w2_a[adapter_name], a=math.sqrt(5)) - self.lokr_w2_a[adapter_name].assign_value( - initializer( - HeUniform(negative_slope=math.sqrt(5)), - self.lokr_w2_a[adapter_name].shape, - self.lokr_w2_a[adapter_name].dtype, - ) - ) - # nn.init.kaiming_uniform_(self.lokr_w2_b[adapter_name], a=math.sqrt(5)) - self.lokr_w2_b[adapter_name].assign_value( - initializer( - HeUniform(negative_slope=math.sqrt(5)), - self.lokr_w2_b[adapter_name].shape, - self.lokr_w2_b[adapter_name].dtype, - ) - ) - - if adapter_name in self.lokr_t2: - # nn.init.kaiming_uniform_(self.lokr_t2[adapter_name], a=math.sqrt(5)) - self.lokr_t2[adapter_name].assign_value( - initializer( - HeUniform(negative_slope=math.sqrt(5)), - self.lokr_t2[adapter_name].shape, - self.lokr_t2[adapter_name].dtype, - ) - ) - - def reset_adapter_parameters_random(self, adapter_name: str): - r""" - Resets the adapter parameters randomly for the specified adapter in the LoKrLayer class. - - Args: - self: The instance of the LoKrLayer class. - adapter_name (str): The name of the adapter to reset. - - Returns: - None. This method does not return any value. - - Raises: - None. - - This method resets the adapter parameters randomly based on the adapter name provided. If the adapter name is found in the self.lokr_w1 dictionary, the self.lokr_w1[adapter_name] parameter is reset -using HeUniform initialization with a negative slope of the square root of 5. If the adapter name is not found in the self.lokr_w1 dictionary, the self.lokr_w1_a[adapter_name] and self.lokr_w1_b[adapter_name] -parameters are reset using the same initialization. - - Similarly, the self.lokr_w2 and self.lokr_t2 parameters are reset based on the adapter name. If the adapter name is found in the self.lokr_w2 dictionary, the self.lokr_w2[adapter_name] parameter is -reset using HeUniform initialization. If the adapter name is not found in the self.lokr_w2 dictionary, the self.lokr_w2_a[adapter_name] and self.lokr_w2_b[adapter_name] parameters are reset using the same -initialization. - - Note: This method assumes that the initializer and HeUniform functions are defined and available. 
- """ - if adapter_name in self.lokr_w1: - # nn.init.kaiming_uniform_(self.lokr_w1[adapter_name], a=math.sqrt(5)) - self.lokr_w1[adapter_name].assign_value( - initializer( - HeUniform(negative_slope=math.sqrt(5)), - self.lokr_w1[adapter_name].shape, - self.lokr_w1[adapter_name].dtype, - ) - ) - else: - # nn.init.kaiming_uniform_(self.lokr_w1_a[adapter_name], a=math.sqrt(5)) - self.lokr_w1_a[adapter_name].assign_value( - initializer( - HeUniform(negative_slope=math.sqrt(5)), - self.lokr_w1_a[adapter_name].shape, - self.lokr_w1_a[adapter_name].dtype, - ) - ) - # nn.init.kaiming_uniform_(self.lokr_w1_b[adapter_name], a=math.sqrt(5)) - self.lokr_w1_b[adapter_name].assign_value( - initializer( - HeUniform(negative_slope=math.sqrt(5)), - self.lokr_w1_b[adapter_name].shape, - self.lokr_w1_b[adapter_name].dtype, - ) - ) - - if adapter_name in self.lokr_w2: - # nn.init.kaiming_uniform_(self.lokr_w2[adapter_name], a=math.sqrt(5)) - self.lokr_w2[adapter_name].assign_value( - initializer( - HeUniform(negative_slope=math.sqrt(5)), - self.lokr_w2[adapter_name].shape, - self.lokr_w2[adapter_name].dtype, - ) - ) - else: - # nn.init.kaiming_uniform_(self.lokr_w2_a[adapter_name], a=math.sqrt(5)) - self.lokr_w2_a[adapter_name].assign_value( - initializer( - HeUniform(negative_slope=math.sqrt(5)), - self.lokr_w2_a[adapter_name].shape, - self.lokr_w2_a[adapter_name].dtype, - ) - ) - # nn.init.kaiming_uniform_(self.lokr_w2_b[adapter_name], a=math.sqrt(5)) - self.lokr_w2_b[adapter_name].assign_value( - initializer( - HeUniform(negative_slope=math.sqrt(5)), - self.lokr_w2_b[adapter_name].shape, - self.lokr_w2_b[adapter_name].dtype, - ) - ) - - if adapter_name in self.lokr_t2: - # nn.init.kaiming_uniform_(self.lokr_t2[adapter_name], a=math.sqrt(5)) - self.lokr_t2[adapter_name].assign_value( - initializer( - HeUniform(negative_slope=math.sqrt(5)), - self.lokr_t2[adapter_name].shape, - self.lokr_t2[adapter_name].dtype, - ) - ) - - def update_layer( - self, - adapter_name: str, - r: int, - alpha: float, - rank_dropout: float, - module_dropout: float, - init_weights: bool, - use_effective_conv2d: bool, - decompose_both: bool, - decompose_factor: int, - **kwargs, - ) -> None: - """Internal function to create lokr adapter - - Args: - adapter_name (`str`): Name for the adapter to add. - r (`int`): Rank for the added adapter. - alpha (`float`): Alpha for the added adapter. - rank_dropout (`float`): The dropout probability for rank dimension during training - module_dropout (`float`): The dropout probability for disabling adapter during training. - init_weights (`bool`): Whether to initialize adapter weights. - use_effective_conv2d (`bool`): Use parameter effective decomposition for Conv2d with ksize > 1. - decompose_both (`bool`): Perform rank decomposition of left kronecker product matrix. - decompose_factor (`int`): Kronecker product decomposition factor. 
- """ - if r <= 0: - raise ValueError( - f"`r` should be a positive integer value but the value passed is {r}" - ) - - self.r[adapter_name] = r - self.alpha[adapter_name] = alpha - self.scaling[adapter_name] = alpha / r - self.rank_dropout[adapter_name] = rank_dropout - self.module_dropout[adapter_name] = module_dropout - base_layer = self.get_base_layer() - - # Determine shape of LoKr weights - if isinstance(base_layer, nn.Linear): - in_dim, out_dim = base_layer.in_features, base_layer.out_features - - in_m, in_n = factorization(in_dim, decompose_factor) - out_l, out_k = factorization(out_dim, decompose_factor) - shape = ( - (out_l, out_k), - (in_m, in_n), - ) # ((a, b), (c, d)), out_dim = a*c, in_dim = b*d - - use_w1 = not (decompose_both and r < max(shape[0][0], shape[1][0]) / 2) - use_w2 = not (r < max(shape[0][1], shape[1][1]) / 2) - use_effective_conv2d = False - elif isinstance(base_layer, nn.Conv2d): - in_dim, out_dim = base_layer.in_channels, base_layer.out_channels - k_size = base_layer.kernel_size - - in_m, in_n = factorization(in_dim, decompose_factor) - out_l, out_k = factorization(out_dim, decompose_factor) - shape = ((out_l, out_k), (in_m, in_n), *k_size) # ((a, b), (c, d), *k_size) - - use_w1 = not (decompose_both and r < max(shape[0][0], shape[1][0]) / 2) - use_w2 = r >= max(shape[0][1], shape[1][1]) / 2 - use_effective_conv2d = use_effective_conv2d and base_layer.kernel_size != ( - 1, - 1, - ) - else: - raise TypeError( - f"LoKr is not implemented for base layers of type {type(base_layer).__name__}" - ) - - # Create weights with provided shape - self.create_adapter_parameters( - adapter_name, r, shape, use_w1, use_w2, use_effective_conv2d - ) - - # Initialize weights - if init_weights: - self.reset_adapter_parameters(adapter_name) - else: - self.reset_adapter_parameters_random(adapter_name) - - self.set_adapter(self.active_adapters) - - def set_adapter(self, adapter_names) -> None: - """Set the active adapter(s). - - Additionally, this function will set the specified adapters to trainable (i.e., requires_grad=True). If this is - not desired, use the following code. - - ```py - >>> for name, param in model_peft.named_parameters(): - ... if ...: # some check on name (ex. if 'lora' in name) - ... param.requires_grad = False - ``` - - Args: - adapter_name (`str` or `List[str]`): Name of the adapter(s) to be activated. - """ - if isinstance(adapter_names, str): - adapter_names = [adapter_names] - - # Deactivate grads on the inactive adapter and activate grads on the active adapter - for layer_name in self.adapter_layer_names: - module_dict = getattr(self, layer_name) - for key, layer in module_dict.items(): - if key in adapter_names: - # Note: It is possible that not a single layer is called with requires_grad_(True) here. This may - # happen if a completely different adapter layer is being activated. - layer.requires_grad = True - else: - layer.requires_grad = False - - self._active_adapter = adapter_names - - def merge( - self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None - ) -> None: - """ - Merge the active adapter weights into the base weights - - Args: - safe_merge (`bool`, *optional*): - If `True`, the merge operation will be performed in a copy of the original weights and check for NaNs - before merging the weights. This is useful if you want to check if the merge operation will produce - NaNs. Defaults to `False`. - adapter_names (`List[str]`, *optional*): - The list of adapter names that should be merged. 
If `None`, all active adapters will be merged. - Defaults to `None`. - """ - adapter_names = check_adapters_to_merge(self, adapter_names) - if not adapter_names: - # no adapter to merge - return - - for active_adapter in adapter_names: - if active_adapter in self._available_adapters: - base_layer = self.get_base_layer() - if safe_merge: - orig_weights = base_layer.weight.data - orig_weights += self.get_delta_weight(active_adapter) - - if not ops.isfinite(orig_weights).all(): - raise ValueError( - f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" - ) - - base_layer.weight.data = orig_weights - else: - base_layer.weight.data += self.get_delta_weight(active_adapter) - self.merged_adapters.append(active_adapter) - - def unmerge(self) -> None: - """ - This method unmerges all merged adapter layers from the base weights. - """ - if not self.merged: - warnings.warn("Already unmerged. Nothing to do.") - return - while len(self.merged_adapters) > 0: - active_adapter = self.merged_adapters.pop() - if active_adapter in self._available_adapters: - self.get_base_layer().weight.data -= self.get_delta_weight( - active_adapter - ) - - def get_delta_weight(self, adapter_name: str) -> ms.Tensor: - r""" - This method calculates the delta weight for a given adapter. - - Args: - self: The instance of the LoKrLayer class. - adapter_name (str): The name of the adapter for which the delta weight is to be calculated. It is a required parameter. - - Returns: - ms.Tensor: Returns a tensor representing the delta weight calculated for the specified adapter. - - Raises: - ValueError: If the adapter_name is not found in the internal data structures. - RuntimeError: If an error occurs during the calculation of the delta weight. - TypeError: If the input data types are incorrect or incompatible. - """ - # https://github.com/KohakuBlueleaf/LyCORIS/blob/e4259b870d3354a9615a96be61cb5d07455c58ea/lycoris/modules/lokr.py#L224 - if adapter_name in self.lokr_w1: - w1 = self.lokr_w1[adapter_name] - else: - w1 = self.lokr_w1_a[adapter_name] @ self.lokr_w1_b[adapter_name] - - if adapter_name in self.lokr_w2: - w2 = self.lokr_w2[adapter_name] - elif adapter_name in self.lokr_t2: - w2 = make_weight_cp( - self.lokr_t2[adapter_name], - self.lokr_w2_a[adapter_name], - self.lokr_w2_b[adapter_name], - ) - else: - w2 = self.lokr_w2_a[adapter_name] @ self.lokr_w2_b[adapter_name] - - # Make weights with Kronecker product - weight = make_kron(w1, w2) - weight = weight.reshape(self.get_base_layer().weight.shape) - - # Perform rank dropout during training - drop rows of addition weights - rank_dropout = self.rank_dropout[adapter_name] - if self.training and rank_dropout: - drop = (ops.rand(weight.size(0)) > rank_dropout).float() - drop = drop.view(-1, *[1] * len(weight.shape[1:])) - drop /= drop.mean() - weight *= drop - - return weight - - def forward(self, x: ms.Tensor, *args, **kwargs) -> ms.Tensor: - """ - Constructs the output tensor using the specified input tensor and additional arguments. - - Args: - self (LoKrLayer): The instance of the LoKrLayer class. - x (ms.Tensor): The input tensor to be processed. - - Returns: - ms.Tensor: The output tensor forwarded based on the input tensor and additional arguments. - - Raises: - TypeError: If the input tensor x is not of type ms.Tensor. - ValueError: If the input tensor x has an unsupported dtype. 
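As the merge and unmerge methods above show, folding an adapter into the base weights is an in-place addition of the materialized Kronecker delta, and unmerging subtracts the same quantity, so a merge followed by an unmerge restores the original weights up to floating-point error. A NumPy sketch of that round trip with illustrative shapes:

```py
import numpy as np

rng = np.random.default_rng(0)

base_weight = rng.normal(size=(12, 20))
w1 = rng.normal(size=(3, 4))   # (out_l, in_m)
w2 = rng.normal(size=(4, 5))   # (out_k, in_n)
scale = 0.25

def get_delta_weight():
    # make_kron followed by a reshape to the base weight's shape, mirroring
    # get_delta_weight above (rank dropout omitted, i.e. eval mode).
    return (np.kron(w1, w2) * scale).reshape(base_weight.shape)

original = base_weight.copy()
base_weight += get_delta_weight()   # merge: fold the adapter into the base layer
base_weight -= get_delta_weight()   # unmerge: recover the original layer

assert np.allclose(base_weight, original)
```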
- """ - previous_dtype = x.dtype - - if self.disable_adapters: - if self.merged: - self.unmerge() - result = self.base_layer(x, *args, **kwargs) - elif self.merged: - result = self.base_layer(x, *args, **kwargs) - else: - result = self.base_layer(x, *args, **kwargs) - - # Execute all the adapters - for active_adapter in self.active_adapters: - if active_adapter not in self._available_adapters: - continue - - module_dropout = self.module_dropout[active_adapter] - - # Modify current execution weights - if (not self.training) or ( - self.training and ops.rand(1) > module_dropout - ): - result = result + self._get_delta_activations( - active_adapter, x, *args, **kwargs - ) - - result = result.to(previous_dtype) - return result - - -class Dense(LoKrLayer): - """LoKr implemented in Dense layer""" - def __init__( - self, - base_layer: nn.Module, - adapter_name: str = "default", - r: int = 0, - alpha: float = 0.0, - rank_dropout: float = 0.0, - module_dropout: float = 0.0, - init_weights: bool = True, - **kwargs, - ): - """ - Initializes a new instance of the Dense class. - - Args: - self: The object itself. - base_layer (nn.Module): The base layer for the adapter. - adapter_name (str): The name of the adapter. Defaults to 'default'. - r (int): The value of r for adapter update. Defaults to 0. - alpha (float): The value of alpha for adapter update. Defaults to 0.0. - rank_dropout (float): The dropout value for rank. Defaults to 0.0. - module_dropout (float): The dropout value for cell. Defaults to 0.0. - init_weights (bool): A flag to initialize weights. Defaults to True. - **kwargs: Additional keyword arguments. - - Returns: - None. This method does not return any value. - - Raises: - - """ - super().__init__(base_layer) - - # Create adapter and set it active - self._active_adapter = adapter_name - self.update_layer( - adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, **kwargs - ) - - def _get_delta_activations( - self, adapter_name: str, input: ms.Tensor, *args: Any, **kwargs: Any - ) -> ms.Tensor: - """ - Method to calculate the delta activations for a given adapter. - - Args: - self: The instance of the Dense class. - adapter_name (str): The name of the adapter to retrieve delta weight for. - input (ms.Tensor): The input tensor for which delta activations are calculated. - *args: Additional positional arguments. - **kwargs: Additional keyword arguments. - - Returns: - ms.Tensor: The calculated delta activations as a tensor. - - Raises: - (Exception): If there is an error in retrieving the delta weight or in performing the dense operation. - """ - delta_weight = self.get_delta_weight( - adapter_name - ) # Forced synchronization of parameter types, dangerous operation - # don't add bias here, because the bias is already included in the output of the base_layer - return ops.dense(input, delta_weight) - - def __repr__(self) -> str: - r""" - This method returns a string representation of the object. - - Args: - self (Dense): The instance of the Dense class. - - Returns: - str: A string representation of the object prefixed with 'lokr.'. - - Raises: - This method does not raise any exceptions. - """ - rep = super().__repr__() - return "lokr." 
+ rep - - -class Conv2d(LoKrLayer): - """LoKr implemented in Conv2d layer""" - def __init__( - self, - base_layer: nn.Module, - adapter_name: str = "default", - r: int = 0, - alpha: float = 0.0, - rank_dropout: float = 0.0, - module_dropout: float = 0.0, - use_effective_conv2d: bool = False, - init_weights: bool = True, - **kwargs, - ): - r""" - Initializes an instance of the Conv2d class. - - Args: - self: The instance of the Conv2d class. - base_layer (nn.Module): The base layer that the adapter will be added on top of. - adapter_name (str): The name of the adapter. Defaults to 'default'. - r (int): The value of parameter 'r'. - alpha (float): The value of parameter 'alpha'. - rank_dropout (float): The value of rank dropout. - module_dropout (float): The value of cell dropout. - use_effective_conv2d (bool): Flag indicating whether to use effective Conv2d. - init_weights (bool): Flag indicating whether to initialize weights. - **kwargs: Additional keyword arguments. - - Returns: - None. This method does not return any value. - - Raises: - None specified. - """ - super().__init__(base_layer) - - # Create adapter and set it active - self._active_adapter = adapter_name - self.update_layer( - adapter_name, - r, - alpha, - rank_dropout, - module_dropout, - init_weights, - use_effective_conv2d, - **kwargs, - ) - - def _get_delta_activations( - self, adapter_name: str, input: ms.Tensor, *args: Any, **kwargs: Any - ) -> ms.Tensor: - r""" - Method to calculate delta activations for Conv2d layer. - - Args: - self (Conv2d): The instance of the Conv2d class. - adapter_name (str): The name of the adapter used for getting delta weight. - input (ms.Tensor): The input tensor for the convolution operation. - - Returns: - ms.Tensor: Returns the delta activations tensor calculated based on the input, delta weight, and base layer parameters. - - Raises: - - KeyError: If the provided adapter_name does not exist. - - ValueError: If there are issues with the input tensor shape or data. - - RuntimeError: If there are runtime issues during the convolution operation. - """ - delta_weight = self.get_delta_weight(adapter_name) - # don't add bias here, because the bias is already included in the output of the base_layer - base_layer = self.get_base_layer() - return ops.conv2d( - input, - delta_weight, - stride=base_layer.stride, - padding=base_layer.padding, - dilation=base_layer.dilation, - groups=base_layer.group, - ) - - def __repr__(self) -> str: - r""" - Return a string representation of the 'Conv2d' object. - - Args: - self: The 'Conv2d' object itself. - - Returns: - A string representation of the 'Conv2d' object, prefixed with 'lokr.'. - - Raises: - None - - Example: - >>> conv = Conv2d() - >>> repr(conv) - 'lokr.Conv2d()' - """ - rep = super().__repr__() - return "lokr." + rep - - -def factorization(dimension: int, factor: int = -1) -> Tuple[int, int]: - """Factorizes the provided number into the product of two numbers - - Args: - dimension (`int`): The number that needs to be factorized. - factor (`int`, optional): - Factorization divider. The algorithm will try to output two numbers, one of each will be as close to the - factor as possible. If -1 is provided, the decomposition algorithm would try to search dividers near the - square root of the dimension. Defaults to -1. - - Returns: - Tuple[`int`, `int`]: A tuple of two numbers, whose product is equal to the provided number. The first number is - always less than or equal to the second. 
- - Example: - ```py - >>> factorization(256, factor=-1) - (16, 16) - - >>> factorization(128, factor=-1) - (8, 16) - - >>> factorization(127, factor=-1) - (1, 127) - - >>> factorization(128, factor=4) - (4, 32) - ``` - """ - if factor > 0 and (dimension % factor) == 0: - m = factor - n = dimension // factor - return m, n - if factor == -1: - factor = dimension - m, n = 1, dimension - length = m + n - while m < n: - new_m = m + 1 - while dimension % new_m != 0: - new_m += 1 - new_n = dimension // new_m - if new_m + new_n > length or new_m > factor: - break - else: - m, n = new_m, new_n - if m > n: - n, m = m, n - return m, n - - -def make_weight_cp(t, wa, wb): - r""" - This function creates a weight tensor by performing the contraction of four-dimensional tensor 't' with two matrices 'wa' and 'wb' along specific dimensions. - - Args: - t (ndarray): A four-dimensional tensor with shape (i, j, k, l), where i, j, k, l represent the dimensions of the tensor. The tensor serves as the base for the contraction operation. - wa (ndarray): A matrix with shape (i, p), where i represents the dimension matching the first dimension of 't', and p represents the desired dimension of the resulting tensor along the first axis. - wb (ndarray): A matrix with shape (j, r), where j represents the dimension matching the second dimension of 't', and r represents the desired dimension of the resulting tensor along the second axis. - - Returns: - ndarray: The resulting weight tensor after performing the contraction operation. The shape of the output tensor is (p, r, k, l), where p and r represent the dimensions specified by 'wa' and 'wb', -respectively, and k, l represent the remaining dimensions inherited from 't'. - - Raises: - None: This function does not raise any exceptions. - """ - rebuild2 = ops.einsum("i j k l, i p, j r -> p r k l", t, wa, wb) # [c, d, k1, k2] - return rebuild2 - - -def make_kron(w1, w2, scale=1.0): - r""" - This function creates a Kronecker product of two input tensors w1 and w2, and then scales the result by the specified scale factor. - - Args: - w1 (tensor): The first input tensor. - w2 (tensor): The second input tensor. For 4-dimensional tensors, w1 will be modified with unsqueeze operations before computing the Kronecker product. - scale (float, optional): The scaling factor applied to the Kronecker product. Defaults to 1.0. - - Returns: - None: The function returns None. - - Raises: - None. - """ - if len(w2.shape) == 4: - w1 = w1.unsqueeze(2).unsqueeze(2) - # w2 = w2 - rebuild = ops.kron(w1, w2) - - return rebuild * scale diff --git a/mindnlp/peft/tuners/lokr/model.py b/mindnlp/peft/tuners/lokr/model.py deleted file mode 100644 index 8407f5a3c..000000000 --- a/mindnlp/peft/tuners/lokr/model.py +++ /dev/null @@ -1,406 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================ -"""Lokr.""" -import re -from typing import Optional, Union, Dict, Type, List -from itertools import chain -from tqdm import tqdm - -from mindnlp.core import nn - -from ...utils import ( - ModulesToSaveWrapper, - _get_submodules, -) - -from ..tuners_utils import ( - BaseTuner, - BaseTunerLayer, - check_target_module_exists, -) -from .layer import Conv2d, Dense, LoKrLayer -from .config import LoKrConfig - - -class LoKrModel(BaseTuner): - """ - Creates Low-Rank Kronecker Product model from a pretrained model. The original method is partially described in - https://arxiv.org/abs/2108.06098 and in https://arxiv.org/abs/2309.14859 Current implementation heavily borrows - from - https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/lokr.py - - Args: - model (`mindspore.nn.Module`): The model to which the adapter tuner layers will be attached. - peft_config ([`LoKrConfig`]): The configuration of the LoKr model. - adapter_name (`str`): The name of the adapter, defaults to `"default"`. - - Returns: - LoKrModel ([`mindspore.nn.Module`]): The LoKr model. - - Example: - ```py - >>> from diffusers import StableDiffusionPipeline - >>> from peft import LoKrModel, LoKrConfig - - >>> config_te = LoKrConfig( - ... r=8, - ... lora_alpha=32, - ... target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], - ... rank_dropout=0.0, - ... module_dropout=0.0, - ... init_weights=True, - ... ) - >>> config_unet = LoKrConfig( - ... r=8, - ... lora_alpha=32, - ... target_modules=[ - ... "proj_in", - ... "proj_out", - ... "to_k", - ... "to_q", - ... "to_v", - ... "to_out.0", - ... "ff.net.0.proj", - ... "ff.net.2", - ... ], - ... rank_dropout=0.0, - ... module_dropout=0.0, - ... init_weights=True, - ... use_effective_conv2d=True, - ... ) - - >>> model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - >>> model.text_encoder = LoKrModel(model.text_encoder, config_te, "default") - >>> model.unet = LoKrModel(model.unet, config_unet, "default") - ``` - - > **Attributes**: - - > - **model** ([`~nn.Module`])— The model to be adapted. - - > - **peft_config** ([`LoKrConfig`]): The configuration of the LoKr model. - - """ - prefix: str = "lokr_" - layers_mapping: Dict[Type[nn.Module], Type[LoKrLayer]] = { - nn.Conv2d: Conv2d, - nn.Linear: Dense, - } - - def _create_and_replace( - self, - config: LoKrConfig, - adapter_name: str, - target: Union[LoKrLayer, nn.Module], - target_name: str, - parent: nn.Module, - current_key: str, - loaded_in_8bit: Optional[bool] = False, - loaded_in_4bit: Optional[bool] = False, - ) -> None: - """ - A private method to create and replace the target cell with the adapter cell. 
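When a target module is swapped out, the tuner walks the `layers_mapping` table above and picks the adapter class registered for the target's base-layer type, unwrapping an already-wrapped tuner layer first. A simplified stand-alone sketch of that dispatch; the Dummy* classes are placeholders, not library types:

```py
# Placeholders standing in for nn.Linear / nn.Conv2d and their LoKr wrappers.
class DummyLinear: ...
class DummyConv2d: ...
class DenseAdapter: ...
class Conv2dAdapter: ...

layers_mapping = {
    DummyConv2d: Conv2dAdapter,
    DummyLinear: DenseAdapter,
}

def create_new_cell(target):
    base = getattr(target, "base_layer", target)  # unwrap nested tuner layers
    for base_cls, adapter_cls in layers_mapping.items():
        if isinstance(base, base_cls):
            return adapter_cls()
    supported = ", ".join(cls.__name__ for cls in layers_mapping)
    raise ValueError(
        f"Target cell of type {type(target).__name__} not supported, "
        f"currently only adapters for {supported} are supported"
    )

assert isinstance(create_new_cell(DummyConv2d()), Conv2dAdapter)
```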
- """ - # Regexp matching - Find key which matches current target_name in patterns provided - pattern_keys = list( - chain(config.rank_pattern.keys(), config.alpha_pattern.keys()) - ) - target_name_key = next( - filter(lambda key: re.match(rf"(.*\.)?{key}$", current_key), pattern_keys), - target_name, - ) - - kwargs = config.to_dict() - kwargs["r"] = config.rank_pattern.get(target_name_key, config.r) - kwargs["alpha"] = config.alpha_pattern.get(target_name_key, config.lora_alpha) - - if isinstance(target, LoKrLayer): - target.update_layer(adapter_name, **kwargs) - else: - new_cell = self._create_new_cell(config, adapter_name, target, **kwargs) - self._replace_cell(parent, target_name, new_cell, target) - - @classmethod - def _create_new_cell( - cls, config: LoKrConfig, adapter_name: str, target: nn.Module, **kwargs - ) -> LoKrLayer: - r""" - This method creates a new LoKrLayer instance based on the provided parameters. - - Args: - cls (class): The class reference. It is used to access the class-level layers_mapping attribute. - config (LoKrConfig): The configuration object used for creating the new cell. - adapter_name (str): The name of the adapter to be associated with the new cell. - target (nn.Module): The target cell for which the new cell is being created. - - Returns: - LoKrLayer: Returns a new instance of LoKrLayer representing the created cell. - - Raises: - ValueError: If the target cell type is not supported, an exception is raised, indicating the unsupported cell type. - This occurs when the target cell type does not match any of the supported cell types in the layers_mapping attribute. - """ - # Find corresponding subtype of provided target cell - new_module_cls = None - for subtype, target_cls in cls.layers_mapping.items(): - if ( - hasattr(target, "base_layer") - and isinstance(target.get_base_layer(), subtype) - and isinstance(target, BaseTunerLayer) - ): - # nested tuner layers are allowed - new_module_cls = target_cls - break - elif isinstance(target, subtype): - new_module_cls = target_cls - break - - # We didn't find corresponding type, so adapter for this layer is not supported - if new_module_cls is None: - supported_modules = ", ".join( - layer.__name__ for layer in cls.layers_mapping.keys() - ) - raise ValueError( - f"Target cell of type {type(target)} not supported, " - f"currently only adapters for {supported_modules} are supported" - ) - - if isinstance(target, BaseTunerLayer): - target_base_layer = target.get_base_layer() - else: - target_base_layer = target - - if isinstance(target_base_layer, nn.Module): - new_cell = new_module_cls(target, adapter_name=adapter_name, **kwargs) - elif isinstance(target_base_layer, nn.Module): - new_cell = new_module_cls(target, adapter_name=adapter_name, **kwargs) - else: - supported_modules = ", ".join( - layer.__name__ for layer in cls.layers_mapping.keys() - ) - raise ValueError( - f"Target cell of type {type(target)} not supported, " - f"currently only adapters for {supported_modules} are supported" - ) - - return new_cell - - def __getattr__(self, name: str): - """Forward missing attributes to the wrapped cell.""" - try: - return super().__getattr__(name) # defer to nn.Module's logic - except AttributeError: - return getattr(self.model, name) - - def _replace_cell(self, parent, child_name, new_cell, child): - r""" - Replaces a cell in the LoKrModel with a new cell. - - Args: - self (LoKrModel): The instance of the LoKrModel class. - parent: The parent object containing the cell to be replaced. 
- child_name: The name of the child object to be replaced. - new_cell: The new cell object to be assigned. - child: The child object to be replaced. - - Returns: - None. This method does not return any value. - - Raises: - None. - """ - setattr(parent, child_name, new_cell) - - # child layer wraps the original cell, unpack it - if hasattr(child, "base_layer"): - child = child.base_layer - - # layers with base_layer don't need the weight to be copied, as they have a reference already - if not hasattr(new_cell, "base_layer"): - new_cell.weight = child.weight - if hasattr(child, "bias"): - new_cell.bias = child.bias - - if getattr(child, "state", None) is not None: - if hasattr(new_cell, "base_layer"): - new_cell.base_layer.state = child.state - else: - new_cell.state = child.state - - def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: - r""" - The _mark_only_adapters_as_trainable method in the LoKrModel class marks only the adapters in the provided model as trainable, by setting the requires_grad attribute to False for parameters not -containing the specified prefix. - - Args: - self (LoKrModel): The instance of the LoKrModel class. - model (nn.Module): The model for which the adapters are to be marked as trainable. - - Returns: - None: This method does not return any value. - - Raises: - None - """ - for n, p in model.parameters_and_names(): - if self.prefix not in n: - p.requires_grad = False - - def _set_adapter_layers(self, enabled=True): - r""" - Sets the adapter layers in the LoKrModel by enabling or disabling them. - - Args: - self (LoKrModel): The instance of the LoKrModel class. - enabled (bool, optional): Indicates whether to enable or disable the adapter layers. Defaults to True. - - Returns: - None. This method does not return any value. - - Raises: - None. - """ - for cell in self.model.modules(): - if isinstance(cell, (BaseTunerLayer, ModulesToSaveWrapper)): - cell.enable_adapters(enabled) - - def _unload_and_optionally_merge( - self, - merge: bool = True, - progressbar: bool = False, - safe_merge: bool = False, - adapter_names: Optional[List[str]] = None, - ): - """ - Method to unload and optionally merge the model. - - Args: - self (LoKrModel): The current instance of the LoKrModel class. - merge (bool): A flag indicating whether to merge the model. Defaults to True. - progressbar (bool): A flag indicating whether to display a progress bar. Defaults to False. - safe_merge (bool): A flag indicating whether to perform a safe merge. Defaults to False. - adapter_names (Optional[List[str]]): A list of adapter names. Defaults to None. - - Returns: - None: This method does not return any value. - - Raises: - ValueError: If the model is gptq quantized and merge is True, it raises a ValueError with the message - "Cannot merge LOHA layers when the model is gptq quantized". - AttributeError: If an attribute error occurs during the method execution. 
- """ - if merge: - if getattr(self.model, "quantization_method", None) == "gptq": - raise ValueError( - "Cannot merge LOHA layers when the model is gptq quantized" - ) - - self._unloading_checks(adapter_names) - key_list = [ - key for key, _ in self.model.named_modules() if self.prefix not in key - ] - desc = "Unloading " + ("and merging " if merge else "") + "model" - for key in tqdm(key_list, disable=not progressbar, desc=desc): - try: - parent, target, target_name = _get_submodules(self.model, key) - except AttributeError: - continue - - if hasattr(target, "base_layer"): - if merge: - target.merge(safe_merge=safe_merge, adapter_names=adapter_names) - self._replace_cell( - parent, target_name, target.get_base_layer(), target - ) - elif isinstance(target, ModulesToSaveWrapper): - # save any additional trainable modules part of `modules_to_save` - new_cell = target.modules_to_save[target.active_adapter] - if hasattr(new_cell, "base_layer"): - # check if the cell is itself a tuner layer - if merge: - new_cell.merge( - safe_merge=safe_merge, adapter_names=adapter_names - ) - new_cell = new_cell.get_base_layer() - setattr(parent, target_name, new_cell) - - return self.model - - def _unloading_checks(self, adapter_names: Optional[List[str]]): - r""" - Perform unloading checks for the LoKrModel class. - - This method checks if multiple adapters with `modules_to_save` specified can be unloaded. - If any of the specified adapters have modules to save, unloading multiple adapters is not allowed. - - Args: - self (LoKrModel): An instance of the LoKrModel class. - adapter_names (Optional[List[str]]): A list of adapter names to consider for unloading. If not provided, all active adapters will be considered. - - Returns: - None. This method does not return any value. - - Raises: - ValueError: If multiple adapters with `modules_to_save` specified are attempted to be unloaded. - - """ - adapters_to_consider = adapter_names or self.active_adapters - is_modules_to_save_available = any( - self.peft_config[adapter].modules_to_save - for adapter in adapters_to_consider - ) - if is_modules_to_save_available and len(adapters_to_consider) > 1: - raise ValueError( - "Cannot unload multiple adapters that specify `modules_to_save`." - ) - - @staticmethod - def _prepare_adapter_config(peft_config, model_config): - r""" - Prepare adapter configuration based on PEFT and model configurations. - - Args: - peft_config (object): The configuration object for PEFT. - It should contain information about the target modules. - Required parameter. Must not be None. - model_config (object): The configuration object for the model. - - Returns: - None. This method does not return any value. - - Raises: - ValueError: If `target_modules` is not specified in `peft_config`. - """ - if peft_config.target_modules is None: - raise ValueError("Please specify `target_modules` in `peft_config`") - return peft_config - - @staticmethod - def _check_target_module_exists(LoKR_config, key): - r""" - Checks if a target cell exists in the LoKR configuration. - - Args: - LoKR_config (dict): The LoKR configuration dictionary containing information about the target modules. - key (str): The key corresponding to the target cell to be checked. - - Returns: - None. Returns None if the target cell exists in the LoKR configuration; otherwise, raises an exception. - - Raises: - This method does not raise any exceptions explicitly. 
However, if the target cell does not exist in the LoKR configuration, further handling may be required based on the context in which this -method is used. - """ - return check_target_module_exists(LoKR_config, key) diff --git a/mindnlp/peft/tuners/lora/__init__.py b/mindnlp/peft/tuners/lora/__init__.py deleted file mode 100644 index 56606cccf..000000000 --- a/mindnlp/peft/tuners/lora/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""lora""" -from .config import LoftQConfig, LoraConfig -from .layer import Conv2d, Embedding, Linear, LoraLayer -from .model import LoraModel - - -__all__ = ["LoraConfig", "LoftQConfig", "Conv2d", "Embedding", "LoraLayer", "Linear", "LoraModel"] diff --git a/mindnlp/peft/tuners/lora/config.py b/mindnlp/peft/tuners/lora/config.py deleted file mode 100644 index 242879ca6..000000000 --- a/mindnlp/peft/tuners/lora/config.py +++ /dev/null @@ -1,364 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""lora config""" -from __future__ import annotations -import warnings -from dataclasses import dataclass, field -from typing import Optional, Union -try: - from typing import Literal -except: - from typing_extensions import Literal - -from mindnlp.core import nn -from ...config import PeftConfig -from ...utils import PeftType - - -@dataclass -class LoftQConfig: - """ - This is the sub-configuration class to store the configuration of a [`LoraModel`]. - - Args: - bits_pattern (`dict`): The mapping from layer names or regexp expression to bits which are different from the - default bits specified by `bits`. For example, `{model.decoder.layers.0.encoder_attn.k_proj: 2`}. - bits (`int`): Quantization bits for LoftQ. - iter (`int`): Alternating iterations for LoftQ. - fake (`bool`): True: use fp16/fp32; used for first time to save weights. False: use bitsandbytes 4bit linear - models. weights can't be saved. Recommend to set to True, save the weights and load the saved weights in 4 - bits. - """ - loftq_bits: int = field(default=4, metadata={"help": "Quantization bits for LoftQ"}) - loftq_iter: int = field(default=1, metadata={"help": "Alternating iterations for LoftQ"}) - - -@dataclass -class LoraConfig(PeftConfig): - """ - This is the configuration class to store the configuration of a [`LoraModel`]. - - Args: - r (`int`): - Lora attention dimension (the "rank"). 
- target_modules (`Optional[Union[List[str], str]]`): - The names of the modules to apply the adapter to. If this is specified, only the modules with the specified - names will be replaced. When passing a string, a regex match will be performed. When passing a list of - strings, either an exact match will be performed or it is checked if the name of the module ends with any - of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D modules are chosen, - excluding the output layer. If this is not specified, modules will be chosen according to the model - architecture. If the architecture is not known, an error will be raised -- in this case, you should specify - the target modules manually. - lora_alpha (`int`): - The alpha parameter for Lora scaling. - lora_dropout (`float`): - The dropout probability for Lora layers. - fan_in_fan_out (`bool`): - Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses - `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`. - bias (`str`): - Bias type for LoRA. Can be 'none', 'all' or 'lora_only'. If 'all' or 'lora_only', the corresponding biases - will be updated during training. Be aware that this means that, even when disabling the adapters, the model - will not produce the same output as the base model would have without adaptation. - use_rslora (`bool`): - When set to True, uses Rank-Stabilized LoRA which - sets the adapter scaling factor to `lora_alpha/math.sqrt(r)`, since it was proven to work better. - Otherwise, it will use the original default value of `lora_alpha/r`. - modules_to_save (`List[str]`): - List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint. - init_lora_weights (`bool` | `Literal["gaussian", "olora", "pissa", "pissa_niter_[number of iters]", "loftq"]`): - How to initialize the weights of the adapter layers. Passing True (default) results in the default - initialization from the reference implementation from Microsoft. Passing 'gaussian' results in Gaussian - initialization scaled by the LoRA rank for linear and layers. Setting the initialization to False leads to - completely random initialization and is discouraged. Pass `'loftq'` to use LoftQ initialization. Pass - `'olora'` to use OLoRA initialization. Passing `'pissa'` results in the initialization of Principal Singular values and Singular vectors Adaptation - (PiSSA), which converges more rapidly than LoRA and ultimately achieves superior performance. Moreover, - PiSSA reduces the quantization error compared to QLoRA, leading to further enhancements. Passing - `'pissa_niter_[number of iters]'` initiates Fast-SVD-based PiSSA initialization, where `[number of iters]` - indicates the number of subspace iterations to perform FSVD, and must be a nonnegative integer. When - `[number of iters]` is set to 16, it can complete the initialization of a 7B model within seconds, and the - training effect is approximately equivalent to using SVD. - layers_to_transform (`Union[List[int], int]`): - The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices - that are specified in this list. If a single integer is passed, it will apply the transformations on the - layer at this index. - layers_pattern (`str`): - The layer pattern name, used only if `layers_to_transform` is different from `None`. 
- rank_pattern (`dict`): - The mapping from layer names or regexp expression to ranks which are different from the default rank - specified by `r`. - alpha_pattern (`dict`): - The mapping from layer names or regexp expression to alphas which are different from the default alpha - specified by `lora_alpha`. - megatron_config (`Optional[dict]`): - The TransformerConfig arguments for Megatron. It is used to create LoRA's parallel linear layer. You can - get it like this, `core_transformer_config_from_args(get_args())`, these two functions being from Megatron. - The arguments will be used to initialize the TransformerConfig of Megatron. You need to specify this - parameter when you want to apply LoRA to the ColumnParallelLinear and RowParallelLinear layers of megatron. - megatron_core (`Optional[str]`): - The core module from Megatron to use, defaults to `"megatron.core"`. - loftq_config (`Optional[LoftQConfig]`): - The configuration of LoftQ. If this is not None, then LoftQ will be used to quantize the backbone weights - and initialize Lora layers. Also pass `init_lora_weights='loftq'`. Note that you should not pass a - quantized model in this case, as LoftQ will quantize the model itself. - use_dora (`bool`): - Enable 'Weight-Decomposed Low-Rank Adaptation' (DoRA). This technique decomposes the updates of the weights - into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the magnitude is - handled by a separate learnable parameter. This can improve the performance of LoRA especially at low - ranks. Right now, DoRA only supports linear and Conv2D layers. DoRA introduces a bigger overhead than pure - LoRA, so it is recommended to merge weights for inference. For more information, see - https://arxiv.org/abs/2402.09353. - layer_replication (`List[Tuple[int, int]]`): - Build a new stack of layers by stacking the original model layers according to the ranges specified. This - allows expanding (or shrinking) the model without duplicating the base model weights. The new layers will - all have separate LoRA adapters attached to them. - """ - - r: int = field(default=8, metadata={"help": "Lora attention dimension"}) - target_modules: Optional[Union[list[str], str]] = field( - default=None, - metadata={ - "help": ( - "List of module names or regex expression of the module names to replace with LoRA." - "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'." - "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer." - "If not specified, modules will be chosen according to the model architecture, If the architecture is " - "not known, an error will be raised -- in this case, you should specify the target modules manually." - ), - }, - ) - lora_alpha: int = field(default=8, metadata={"help": "Lora alpha"}) - lora_dropout: float = field(default=0.0, metadata={"help": "Lora dropout"}) - fan_in_fan_out: bool = field( - default=False, - metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"}, - ) - bias: Literal["none", "all", "lora_only"] = field( - default="none", metadata={"help": "Bias type for Lora. Can be 'none', 'all' or 'lora_only'"} - ) - use_rslora: bool = field( - default=False, - metadata={ - "help": ( - "When set to True, uses Rank-Stabilized LoRA" - " which sets the adapter scaling factor to `lora_alpha/math.sqrt(r)`, since it" - " was proven to work better. Otherwise, it will use the original default" - " value of `lora_alpha/r`." 
- ) - }, - ) - modules_to_save: Optional[list[str]] = field( - default=None, - metadata={ - "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. " - "For example, in Sequence Classification or Token Classification tasks, " - "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." - }, - ) - init_lora_weights: bool | Literal["gaussian", "olora", "pissa", "pissa_niter_[number of iters]", "loftq"] = field( - default=True, - metadata={ - "help": ( - "How to initialize the weights of the LoRA layers. Passing `'True'` (default) results in the default " - "initialization from the reference implementation from Microsoft. Passing `'gaussian'` results " - "in Gaussian initialization scaled by the LoRA rank for linear and layers. Setting the initialization " - "to `'False'` leads to completely random initialization and *is discouraged.*" - "Passing `'olora'` results in OLoRA initialization." - "Passing `'pissa'` results in PiSSA initialization." - "Passing `'pissa_niter_[number of iters]'` initiates Fast-SVD-based PiSSA initialization, " - "where [number of iters] indicates the number of subspace iterations to perform fsvd, and must be a nonnegative integer." - "Pass `'loftq'` to use LoftQ initialization" - ), - }, - ) - layers_to_transform: Optional[Union[list[int], int]] = field( - default=None, - metadata={ - "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. " \ - "If a single integer is passed, PEFT will transform only the layer at this index. " - "This only works when target_modules is a list of str." - }, - ) - layers_pattern: Optional[Union[list[str], str]] = field( - default=None, - metadata={ - "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern." - "This only works when target_modules is a list of str." - }, - ) - rank_pattern: Optional[dict] = field( - default_factory=dict, - metadata={ - "help": ( - "The mapping from layer names or regexp expression to ranks which are different from the default rank specified by `r`. " - "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 8`}" - ) - }, - ) - alpha_pattern: Optional[dict] = field( - default_factory=dict, - metadata={ - "help": ( - "The mapping from layer names or regexp expression to alphas which are different from the default alpha specified by `lora_alpha`. " - "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 32`}" - ) - }, - ) - megatron_config: Optional[dict] = field( - default=None, - metadata={ - "help": ( - "The TransformerConfig from Megatron. It is used to create LoRA's parallel linear layer." - "You can get it like this, `core_transformer_config_from_args(get_args())`, " - "these two functions being from Megatron." - "You need to specify this parameter when you want to apply LoRA to the ColumnParallelLinear and " - "RowParallelLinear layers of megatron." - "It should be noted that we may not be able to use the `save_pretrained` and `from_pretrained` " - "functions, because TransformerConfig may not necessarily be serialized." - "But when using megatron, we can use `get_peft_model_state_dict` function and " - "megatron's framework, they can also save and load models and configurations." 
- ) - }, - ) - megatron_core: Optional[str] = field( - default="megatron.core", - metadata={ - "help": ( - "The core module from Megatron, it is used to create LoRA's parallel linear layer. " - "It only needs to be passed in when you need to use your own modified megatron core module. " - "Otherwise, it will use the default value `megatron.core`. " - ) - }, - ) - # dict type is used when loading config.json - loftq_config: Union[LoftQConfig, dict] = field( - default_factory=dict, - metadata={ - "help": ( - "The configuration of LoftQ. If this is passed, then LoftQ will be used to quantize the backbone " - "weights and initialize Lora layers. Also set `init_lora_weights='loftq'` in this case." - ) - }, - ) - use_dora: bool = field( - default=False, - metadata={ - "help": ( - "Enable 'Weight-Decomposed Low-Rank Adaptation' (DoRA). This technique decomposes the updates of the " - "weights into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the " - "magnitude is handled by a separate learnable parameter. This can improve the performance of LoRA, " - "especially at low ranks. Right now, DoRA only supports linear and Conv2D layers. DoRA introduces a bigger" - "overhead than pure LoRA, so it is recommended to merge weights for inference." - ) - }, - ) - # Enables replicating layers in a model to expand it to a larger model. - layer_replication: Optional[list[tuple[int, int]]] = field( - default=None, - metadata={ - "help": ( - "This enables using LoRA to effectively expand a transformer model to a larger size by repeating some layers. " - "The transformation handles models (currently Llama, Bert or Falcon compatible architectures) with " - "a module list in the model which it modifies to expand the number of modules. " - "Base weights are shared so the memory usage is close to the original model. The intended use is these base weights " - "remain fixed during finetuning but each layer has a separate LoRA adapter so the layers can be specialed via " - "the adapter layers fit during fine tuning." - "The format is a list of [start, end) pairs which specify the layer ranges to stack. For example:\n" - " Original model has 5 layers labelled by their position in the model: `[0, 1, 2, 3, 4]`\n" - " layer_replication: `[[0, 4], [2, 5]]`\n" - " Final model will have this arrangement of original layers: `[0, 1, 2, 3, 2, 3, 4]`\n" - "This format is based on what is used for pass-through merges in mergekit. It makes it simple to select sequential " - "ranges of a model and stack them while reusing layers at either end of each sequence." - ) - }, - ) - - def to_dict(self): - """ - Returns the configuration for your adapter model as a dictionary. Removes runtime configurations. 
- """ - rv = super().to_dict() - rv.pop("runtime_config") - return rv - - def __post_init__(self): - self.peft_type = PeftType.LORA - self.target_modules = ( - set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules - ) - # if target_modules is a regex expression, then layers_to_transform should be None - if isinstance(self.target_modules, str) and self.layers_to_transform is not None: - raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.") - - # if target_modules is a regex expression, then layers_pattern should be None - if isinstance(self.target_modules, str) and self.layers_pattern is not None: - raise ValueError("`layers_pattern` cannot be used when `target_modules` is a str.") - - if self.use_dora and self.megatron_config: - raise ValueError("DoRA does not support megatron_core, please set `use_dora=False`.") - - # handle init_lora_weights and loftq_config - if self.init_lora_weights == "loftq": - import importlib - - if not importlib.util.find_spec("scipy"): - raise ImportError("The required package 'scipy' is not installed. Please install it to continue.") - if self.loftq_config is None: - raise ValueError("`loftq_config` must be specified when `init_lora_weights` is 'loftq'.") - - # Using post training conversion of modified base weights to restore their initial values (PiSSA, OLoRA) cannot - # be correctly done when using rslora + rank_pattern/alpha_pattern. We can't really know if the user intends - # this when they'll eventually call save_pretrained (i.e. if they'll pass - # path_initial_model_for_weight_conversionl). Therefore, we only warn but don't raise an error here. - if ( - self.use_rslora - and (self.rank_pattern or self.alpha_pattern) - and ( - (isinstance(self.init_lora_weights, str) and (self.init_lora_weights.startswith("pissa"))) - or (self.init_lora_weights == "olora") - ) - ): - msg = ( - "Using Rank-Stabilized LoRA with rank_pattern/alpha_pattern and post-training conversion of modified " - "base weights (PiSSA, OLoRA) means that you won't be able to pass " - "`path_initial_model_for_weight_conversion` to `save_pretrained` to restore the initial values of the " - "base weights; if you intend to do this, please ensure not to use rslora or rank_pattern/alpha_pattern." - ) - warnings.warn(msg) - - # convert loftq_config to dict - if self.loftq_config and not isinstance(self.loftq_config, dict): - self.loftq_config = vars(self.loftq_config) - - self._custom_modules: Optional[dict[type[nn.Mmodule], type[nn.Module]]] = None - - def _register_custom_module(self, mapping: dict[type[nn.Mmodule], type[nn.Module]]) -> None: - """ - Experimental API to support providing custom LoRA layers. - - This API is subject to change, you should carefully read the docs before deciding to use it: - - https://huggingface.co/docs/peft/developer_guides/custom_models - - To register custom LoRA module types, call this method with a `mapping` argument that is a dict that maps from - the target layer type to the custom LoRA layer type. The dict can contain multiple items if you wish to target - multiple layer types. The target layer type can be any nn.Module that we currently don't support in PEFT, - whether that is an official PyTorch layer type or a custom layer type. The custom LoRA module class has to be - implemented by the user and follow the PEFT conventions for LoRA layers. 
- - """ - if self._custom_modules is None: - self._custom_modules = {} - self._custom_modules.update(mapping) diff --git a/mindnlp/peft/tuners/lora/dora.py b/mindnlp/peft/tuners/lora/dora.py deleted file mode 100644 index ed6c7855a..000000000 --- a/mindnlp/peft/tuners/lora/dora.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright 2024-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""dora for peft""" -from copy import deepcopy -import mindspore -from mindnlp.core.nn import functional as F -from mindnlp.core import nn -from mindnlp.core import ops -from ...utils.other import transpose - - -class DoraLinearLayer(nn.Module): - def __init__(self, fan_in_fan_out): - super().__init__() - self.fan_in_fan_out = fan_in_fan_out - def get_weight_norm(self, weight, lora_weight, scaling) -> mindspore.Tensor: - # calculate L2 norm of weight matrix, column-wise - weight = transpose(weight, self.fan_in_fan_out) - weight = weight + scaling * lora_weight - weight_norm = ops.norm(weight, dim=1).to(weight.dtype) - return weight_norm - def update_layer(self, *, base_layer, lora_A, lora_B, scaling, place_on_cpu=False) -> None: - # temporarily convert fp16 to fp32, as fp16 can cause trouble on CPU with PyTorch < 2.2 - dtype_is_fp16 = lora_A.dtype == mindspore.float16 - if dtype_is_fp16: - lora_A = lora_A.float() - lora_B = lora_B.float() - if base_layer.__class__.__name__ == "Linear4bit": - # We have to create a copy of the base layer, otherwise, FSDP will throw an error. 8bit does not work - # yet because Int8Params cannot be correctly deep-copied (attributes vanish) - base_layer = deepcopy(base_layer) - weight = base_layer.weight - if weight.data.ndim == 4: # For handling LoRAs applied to Conv2Ds. - lora_weight = ops.mm(lora_B.flatten(start_dim=1), lora_A.flatten(start_dim=1)) - lora_weight = lora_weight.reshape(weight.shape) - else: - lora_weight = lora_B @ lora_A - weight_norm = self.get_weight_norm(weight, lora_weight, scaling) - self.weight = nn.Parameter(weight_norm, requires_grad=True) - def forward(self, x, *, lora_A, lora_B, scaling, base_layer): - """ - For DoRA, calculate the extra output from LoRA with DoRA applied. This should be added on top of the base layer - output. - """ - lora_result = lora_B(lora_A(x)) - - # Don't use `lora_weight = lora_B.weight @ lora_A.weight` because this causes errors with FSDP. Instead, - # calculate the same but using forward. - x_eye = ops.eye(lora_A.weight.shape[1], dtype=x.dtype) - lora_weight = lora_B(lora_A(x_eye)).T - - magnitude = self.weight - weight = base_layer.weight - weight = weight.to(x.dtype) - weight_norm = self.get_weight_norm(weight, lora_weight, scaling) - # see section 4.3 of DoRA (https://arxiv.org/abs/2402.09353) - # "[...] we suggest treating ||V +∆V ||_c in - # Eq. (5) as a constant, thereby detaching it from the gradient - # graph. 
This means that while ||V + ∆V ||_c dynamically - # reflects the updates of ∆V , it won’t receive any gradient - # during backpropagation" - # weight_norm = weight_norm.detach() - mag_norm_scale = (magnitude / weight_norm).view(1, -1) - result_dora = (mag_norm_scale - 1) * ( - F.linear(x, transpose(weight, self.fan_in_fan_out)) - ) + mag_norm_scale * lora_result * scaling - - # Note: Computation could potentially be accelerated by using the code below instead of calculating X@W again. - # This is only correct if dropout=0, otherwise results will differ: - # https://github.com/huggingface/peft/pull/1474#issuecomment-1964682771 - # bias = self.get_base_layer().bias - # if bias is not None: - # result = result - bias - # result = mag_norm_scale * result + mag_norm_scale * lora_B(lora_A(x)) * scaling - # if bias is not None: - # result = result + bias - - return result_dora - - def __repr__(self) -> str: - rep = super().__repr__() - return "lora.dora." + rep - - -class DoraEmbeddingLayer(DoraLinearLayer): - def forward(self, x, *, lora_A, lora_B, scaling, base_layer, embed_fn): - """ - For DoRA, calculate the extra output from LoRA with DoRA applied. This should be added on top of the base layer - output. - """ - lora_weight = (lora_A @ lora_B).T - magnitude = self.weight - weight = base_layer.weight - weight_norm = self.get_weight_norm(weight, lora_weight, scaling) - # see section 4.3 of DoRA (https://arxiv.org/abs/2402.09353) - # "[...] we suggest treating ||V +∆V ||_c in - # Eq. (5) as a constant, thereby detaching it from the gradient - # graph. This means that while ||V + ∆V ||_c dynamically - # reflects the updates of ∆V , it won’t receive any gradient - # during backpropagation" - # weight_norm = weight_norm.detach() - mag_norm_scale = magnitude / weight_norm - result_dora = mag_norm_scale * (embed_fn(x, lora_A) @ lora_B) * scaling - return mag_norm_scale, result_dora - - def __repr__(self) -> str: - rep = super().__repr__() - return "lora.dora." + rep - - -class DoraConv2dLayer(DoraLinearLayer): - def get_weight_norm(self, weight, lora_weight, scaling) -> mindspore.Tensor: - # calculate L2 norm of weight matrix, column-wise - weight = weight + scaling * lora_weight - # the following is needed to have compatibility with the 4D weight tensors of Conv2D - weight_norm = weight.norm(p=2, dim=(1, 2, 3), keepdim=True).transpose(1, 0) - return weight_norm - - def forward(self, x, *, lora_A, lora_B, scaling, base_layer): - """ - For DoRA, calculate the extra output from LoRA with DoRA applied. This should be added on top of the base layer - output. - """ - weight = base_layer.weight - lora_weight = ops.mm(lora_B.weight.flatten(start_dim=1), lora_A.weight.flatten(start_dim=1)) - lora_weight = lora_weight.reshape(weight.shape) - magnitude = self.weight - weight_norm = self.get_weight_norm(weight, lora_weight, scaling) - # see section 4.3 of DoRA (https://arxiv.org/abs/2402.09353) - # "[...] we suggest treating ||V +∆V ||_c in - # Eq. (5) as a constant, thereby detaching it from the gradient - # graph. 
 This means that while ||V + ∆V ||_c dynamically
- # reflects the updates of ∆V , it won’t receive any gradient
- # during backpropagation"
- # weight_norm = weight_norm.detach()
- mag_norm_scale = magnitude / weight_norm
- result_dora = (mag_norm_scale - 1) * (
- F.conv2d(
- x,
- weight,
- bias=None,
- stride=base_layer.stride,
- padding=base_layer.padding,
- dilation=base_layer.dilation,
- groups=base_layer.groups,
- )
- ) + mag_norm_scale * lora_B(lora_A(x)) * scaling
-
- return result_dora
-
- def __repr__(self) -> str:
- rep = super().__repr__()
- return "lora.dora." + rep
diff --git a/mindnlp/peft/tuners/lora/layer.py b/mindnlp/peft/tuners/lora/layer.py
deleted file mode 100644
index 5579210c8..000000000
--- a/mindnlp/peft/tuners/lora/layer.py
+++ /dev/null
@@ -1,1519 +0,0 @@
-# Copyright 2023-present the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""lora layer"""
-from __future__ import annotations
-
-import math
-import warnings
-from typing import Any, Optional, Union
-
-import mindspore
-from mindnlp.core.nn import Parameter
-from mindnlp.core import nn, ops
-from mindnlp.core.nn import ParameterDict, functional as F
-from ....transformers.ms_utils import Conv1D
-from ..tuners_utils import BaseTunerLayer, check_adapters_to_merge
-from ...utils.other import transpose
-from .dora import DoraConv2dLayer, DoraLinearLayer
-from .config import LoraConfig
-
-
-class LoraLayer(BaseTunerLayer):
-
- r"""
- The `LoraLayer` class represents a layer that implements Low-Rank Adaptation (LoRA) for neural network models. It inherits from the `BaseTunerLayer` class and provides methods for updating and scaling the layer's parameters, as well as performing mixed batch forward operations.
-
- Attributes:
- base_layer (nn.Module): The base layer used for computation.
- r (dict): Dictionary of adapter names and associated integer values representing the r parameter in LoRA.
- lora_alpha (dict): Dictionary of adapter names and associated float values representing the alpha parameter in LoRA.
- scaling (dict): Dictionary of adapter names and associated float values representing the scaling factor in LoRA.
- lora_dropout (nn.ModuleDict): Dictionary of adapter names and associated dropout layers used in LoRA.
- lora_A (nn.ModuleDict): Dictionary of adapter names and associated nn.Linear layers used in LoRA for input transformation.
- lora_B (nn.ModuleDict): Dictionary of adapter names and associated nn.Linear layers used in LoRA for output transformation.
- lora_embedding_A (ParameterDict): Dictionary of adapter names and associated parameter dictionaries used in LoRA for input embedding.
- lora_embedding_B (ParameterDict): Dictionary of adapter names and associated parameter dictionaries used in LoRA for output embedding.
- _disable_adapters (bool): Boolean flag indicating whether adapters are disabled.
- merged_adapters (list): List of merged adapters. 
- use_dora (dict): Dictionary of adapter names and associated boolean values indicating whether DoRA (Weight-Decomposed Low-Rank Adaptation) is enabled.
- lora_magnitude_vector (Optional[ParameterDict]): Optional parameter dictionary for storing the magnitude vector in LoRA.
- _caches (dict): Dictionary for caching intermediate values during computation.
- kwargs (dict): Additional keyword arguments.
-
- Methods:
- update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora=False): Updates the LoRA layer with the specified adapter parameters.
- reset_lora_parameters(adapter_name, init_lora_weights): Resets the LoRA layer parameters based on the specified initialization method.
- _get_weight_norm(weight, lora_weight, scaling): Computes the normalized weight using LoRA parameters.
- _cache_store(key, value): Stores a value in the cache.
- _cache_pop(key): Retrieves and removes a value from the cache.
- set_scale(adapter, scale): Sets the scaling factor for a specific adapter.
- scale_layer(scale): Scales the layer by the specified factor.
- unscale_layer(scale=None): Unscales the layer by the specified factor or to its original scaling.
- _check_forward_args(x, *args, **kwargs): Checks the compatibility of arguments with the model's configuration and state.
- _mixed_batch_forward(x, *args, adapter_names, **kwargs): Performs a mixed batch forward operation considering the specified adapter names.
-
- Raises:
- ValueError: If unsupported layer types or incorrect adapter configurations are encountered.
- """
- # All names of layers that may contain (trainable) adapter weights
- adapter_layer_names = ("lora_A", "lora_B", "lora_embedding_A", "lora_embedding_B")
- # All names of other parameters that may contain adapter-related parameters
- other_param_names = ("r", "lora_alpha", "scaling", "lora_dropout")
-
- def __init__(self, base_layer: nn.Module, ephemeral_gpu_offload: bool = False, **kwargs) -> None:
- r"""
- __init__
-
- This method initializes the LoraLayer class.
-
- Args:
- self: LoraLayer object
- The instance of the LoraLayer class.
- base_layer: nn.Module
- The base layer to be used for the LoraLayer. It can be an instance of nn.Linear, nn.Conv2d, nn.Embedding, Conv1D, or other supported layer types.
-
- Returns:
- None
- This method does not return any value.
-
- Raises:
- ValueError
- If the base_layer type is not supported or recognized. 
- """ - self.base_layer = base_layer - self.r = {} - self.lora_alpha = {} - self.scaling = {} - self.lora_dropout = nn.ModuleDict({}) - self.lora_A = nn.ModuleDict({}) - self.lora_B = nn.ModuleDict({}) - # For Embedding layer - self.lora_embedding_A = ParameterDict({}) - self.lora_embedding_B = ParameterDict({}) - # Mark the weight as unmerged - self._disable_adapters = False - self.merged_adapters = [] - self.use_dora: dict[str, bool] = {} - self.lora_magnitude_vector = nn.ModuleDict() # for DoRA - self._caches: dict[str, Any] = {} - self.ephemeral_gpu_offload: bool = ephemeral_gpu_offload - self.kwargs = kwargs - - base_layer = self.get_base_layer() - if isinstance(base_layer, nn.Linear): - in_features, out_features = base_layer.in_features, base_layer.out_features - elif isinstance(base_layer, nn.Conv2d): - in_features, out_features = base_layer.in_channels, base_layer.out_channels - elif isinstance(base_layer, nn.Embedding): - in_features, out_features = base_layer.num_embeddings, base_layer.embedding_dim - elif isinstance(base_layer, Conv1D): - in_features, out_features = ( - base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape - ) - elif hasattr(base_layer, "infeatures") and hasattr(base_layer, "outfeatures"): - # QuantLinear - in_features, out_features = base_layer.infeatures, base_layer.outfeatures - elif hasattr(base_layer, "input_size") and hasattr(base_layer, "output_size"): - # Megatron ColumnParallelLinear,RowParallelLinear - in_features, out_features = base_layer.input_size, base_layer.output_size - elif hasattr(base_layer, "codebooks") and base_layer.__class__.__name__ == "QuantizedLinear": - # AQLM QuantLinear - in_features, out_features = base_layer.in_features, base_layer.out_features - elif hasattr(base_layer, "w_bit") and base_layer.__class__.__name__ == "WQLinear_GEMM": - # Awq layers - in_features, out_features = base_layer.in_features, base_layer.out_features - elif base_layer.__class__.__name__ == "EetqLinear": - # Eetq layers - in_features, out_features = base_layer.in_features, base_layer.out_features - elif hasattr(base_layer, "W_q") and base_layer.__class__.__name__ == "HQQLinear": - # HQQ layers - in_features, out_features = base_layer.in_features, base_layer.out_features - else: - raise ValueError(f"Unsupported layer type {type(base_layer)}") - - self.in_features = in_features - self.out_features = out_features - - def update_layer( - self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora: bool = False - ): - r""" - Update the layer configuration for the specified adapter in the LoraLayer class. - - Args: - self (LoraLayer): The LoraLayer instance. - adapter_name (str): The name of the adapter to be updated. - r (int): The number of units in the layer. Should be a positive integer. - lora_alpha (float): The alpha value for Lora scaling. - lora_dropout (float): The dropout rate for the Lora layer. Should be in the range [0.0, 1.0]. - init_lora_weights (str or bool): The method for initializing Lora weights. Can be 'loftq' or a boolean value. - use_rslora (bool): Flag indicating whether to use RS-Lora scaling. - use_dora (bool, optional): Flag indicating whether to use Dora. Defaults to False. - - Returns: - None. The method updates the internal state of the LoraLayer instance. - - Raises: - ValueError: If the value of 'r' is not a positive integer. 
- """ - # This code works for linear layers, override for other layer types - if r <= 0: - raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") - - self.r[adapter_name] = r - self.lora_alpha[adapter_name] = lora_alpha - if lora_dropout > 0.0: - lora_dropout_layer = nn.Dropout(p=lora_dropout) - else: - lora_dropout_layer = nn.Identity() - - self.lora_dropout.update(nn.ModuleDict({adapter_name: lora_dropout_layer})) - # Actual trainable parameters - self.lora_A[adapter_name] = nn.Linear(self.in_features, r, bias=False) - self.lora_B[adapter_name] = nn.Linear(r, self.out_features, bias=False) - if use_rslora: - self.scaling[adapter_name] = lora_alpha / math.sqrt(r) - else: - self.scaling[adapter_name] = lora_alpha / r - - if init_lora_weights == "loftq": - self.loftq_init(adapter_name) - elif init_lora_weights: - self.reset_lora_parameters(adapter_name, init_lora_weights) - - # check weight and qweight (for GPTQ) - for weight_name in ("weight", "qweight"): - weight = getattr(self.get_base_layer(), weight_name, None) - if weight is not None: - # the layer is already completely initialized, this is an update - if ops.is_floating_point(weight) or ops.is_complex(weight): - for param in self.parameters(): - param.assign_value(param.astype(weight.dtype)) - break - - if use_dora: - self.dora_init(adapter_name) - - self.use_dora[adapter_name] = True - else: - self.use_dora[adapter_name] = False - - self.set_adapter(self.active_adapters) - - def reset_lora_parameters(self, adapter_name, init_lora_weights): - r""" - Reset the LoRa parameters for a given adapter. - - Args: - self (object): The instance of the LoraLayer class. - adapter_name (str): The name of the LoRa adapter for which parameters need to be reset. - init_lora_weights (bool/str): Specifies the type of initialization for LoRa weights. - If False, no initialization is performed. - If True, HeUniform initialization with sqrt(5) is applied. - If 'gaussian', Normal initialization with a scale of 1 divided by r[adapter_name] is used. - - Returns: - None. This method does not return any value. - - Raises: - ValueError: If the init_lora_weights parameter is not recognized or has an unsupported value. 
- """ - if init_lora_weights is False: - return - - if adapter_name in self.lora_A.keys(): - if init_lora_weights is True: - # initialize A the same way as the default for nn.Linear and B to zero - # https://github.com/microsoft/LoRA/blob/a0a92e0f26c067cf94747bdbf1ce73793fa44d19/loralib/layers.py#L124 - nn.init.kaiming_uniform_(self.lora_A[adapter_name].weight, a=math.sqrt(5)) - elif init_lora_weights.lower() == "gaussian": - nn.init.normal_(self.lora_A[adapter_name].weight, std=1 / self.r[adapter_name]) - else: - raise ValueError(f"Unknown initialization {init_lora_weights}") - nn.init.zeros_(self.lora_B[adapter_name].weight) - if adapter_name in self.lora_embedding_A.keys(): - # initialize a the same way as the default for nn.Linear and b to zero - nn.init.zeros_(self.lora_embedding_A[adapter_name]) - nn.init.normal_(self.lora_embedding_B[adapter_name]) - - def dora_init(self, adapter_name: str) -> None: - if not self.lora_magnitude_vector: - # first dora layer being added, add lora_magnitude_vector to the list of learnable parameters - self.adapter_layer_names = self.adapter_layer_names[:] + ("lora_magnitude_vector",) - - dora_layer = DoraLinearLayer(fan_in_fan_out=getattr(self, "fan_in_fan_out", False)) - lora_A = self.lora_A[adapter_name].weight - lora_B = self.lora_B[adapter_name].weight - place_on_cpu = self.ephemeral_gpu_offload and (lora_A.device.type == "cpu" or lora_B.device.type == "cpu") - # if self.ephemeral_gpu_offload: - # if lora_A.device.type in ["cuda", "xpu"]: - # lora_B = lora_B.to(lora_A.device) - # else: - # if lora_B.device.type not in ["cuda", "xpu"]: - # if is_xpu_available(): - # lora_B = lora_B.to("xpu") - # else: - # lora_B = lora_B.to("cuda") - # lora_A = lora_A.to(lora_B.device) - scaling = self.scaling[adapter_name] - dora_layer.update_layer( - base_layer=self.get_base_layer(), lora_A=lora_A, lora_B=lora_B, scaling=scaling, place_on_cpu=place_on_cpu - ) - self.lora_magnitude_vector[adapter_name] = dora_layer - - def _get_weight_norm(self, weight, lora_weight, scaling) -> mindspore.Tensor: - r""" - This method calculates the normalized weight for the LoraLayer. - - Args: - self (LoraLayer): The instance of the LoraLayer class. - weight (mindspore.Tensor): The weight tensor to be normalized. - lora_weight (mindspore.Tensor): The Lora weight tensor to be added to the weight. - scaling (float): The scaling factor to be applied to the lora_weight before adding to the weight. - - Returns: - mindspore.Tensor: The normalized weight tensor after applying the LoraLayer normalization process. - - Raises: - ValueError: If the weight or lora_weight tensors are invalid or incompatible for normalization. - TypeError: If the input types are not as expected. - """ - # calculate L2 norm of weight matrix, column-wise - weight = transpose(weight, self.fan_in_fan_out) - weight = weight + scaling * lora_weight - weight_norm = F.normalize(weight, dim=1).to(weight.dtype) - return weight_norm - - def _cache_store(self, key: str, value: Any) -> None: - r""" - Method _cache_store in the LoraLayer class. - - This method stores a key-value pair in the cache. - - Args: - self (LoraLayer): The instance of the LoraLayer class. - key (str): The key for the cache entry. - value (Any): The value to be stored in the cache. - - Returns: - None. This method does not return any value. - - Raises: - No specific exceptions are raised by this method. - """ - self._caches[key] = value - - def _cache_pop(self, key: str) -> Any: - r""" - Method _cache_pop in class LoraLayer. 
- - This method is responsible for popping the value associated with the specified key from the cache. - - Args: - self (LoraLayer): The instance of the LoraLayer class. - key (str): The key for which the associated value needs to be popped from the cache. - - Returns: - Any: The value associated with the specified key in the cache. - - Raises: - KeyError: If the specified key is not present in the cache. - Exception: Any other unexpected exceptions during the operation. - """ - value = self._caches.pop(key) - return value - - def set_scale(self, adapter, scale): - r""" - This method sets the scale for a specific adapter in the LoraLayer class. - - Args: - self (object): The instance of the LoraLayer class. - adapter (str): The identifier of the adapter for which the scale is to be set. - scale (float): The scale value to be set for the specified adapter. It is a floating point number. - - Returns: - None: This method does not return any value. - - Raises: - - KeyError: If the specified adapter is not found in the 'scaling' attribute of the LoraLayer instance. - - ZeroDivisionError: If the scale calculation involves division by zero, such as when the 'r' attribute for the specified adapter is zero. - """ - if adapter not in self.scaling: - # Ignore the case where the adapter is not in the layer - return - self.scaling[adapter] = scale * self.lora_alpha[adapter] / self.r[adapter] - - def scale_layer(self, scale: float) -> None: - r""" - Scale the layer by a specified factor. - - Args: - self (LoraLayer): The instance of the LoraLayer class. - scale (float): The scaling factor to be applied to the layer. Must be a float value. - - Returns: - None. This method does not return any value. - - Raises: - - TypeError: If the scale parameter is not a float. - """ - if scale == 1: - return - - for active_adapter in self.active_adapters: - if active_adapter not in self.lora_A.keys(): - continue - - self.scaling[active_adapter] *= scale - - def unscale_layer(self, scale=None) -> None: - r""" - This method unscales a layer by either calculating a new scaling factor or dividing the current scaling factor by a specified scale value. - - Args: - self (LoraLayer): The instance of the LoraLayer class. - scale (float, optional): The value by which to divide the current scaling factor. If set to None, a new scaling factor is calculated based on the existing values. Default is None. - - Returns: - None: This method does not return any value. - - Raises: - - KeyError: If the active_adapter is not found in the keys of the lora_A dictionary. - - ZeroDivisionError: If the scale parameter is 0 and the current scaling factor needs to be divided by it. - """ - for active_adapter in self.active_adapters: - if active_adapter not in self.lora_A.keys(): - continue - - if scale is None: - self.scaling[active_adapter] = self.lora_alpha[active_adapter] / self.r[active_adapter] - else: - self.scaling[active_adapter] /= scale - - def _check_forward_args(self, x, *args, **kwargs): - """Check if the arguments are compatible with the configs and state of the model""" - adapter_names = kwargs.get("adapter_names", None) - if adapter_names is None: - return - - if len(x) != len(adapter_names): - msg = ( - "Length of `adapter_names` should be the same as the number of inputs, but got " - f"{len(adapter_names)} and {len(x)} respectively." - ) - raise ValueError(msg) - - if self.merged: - # It is unclear what would be the right thing to do if users pass adapter_names and there are merged - # adapters. 
Therefore, it is better to raise an error in this case. - msg = "Cannot pass `adapter_names` when there are merged adapters, please call `unmerge_adapter` first." - raise ValueError(msg) - - unique_adapters = set(self.active_adapters) - for adapter_name in unique_adapters: - if self.use_dora.get(adapter_name, False): - msg = "Cannot pass `adapter_names` when DoRA is enabled." - raise ValueError(msg) - - def _mixed_batch_forward( - self, x: mindspore.Tensor, *args: Any, adapter_names: list[str], **kwargs: Any - ) -> mindspore.Tensor: - r""" - This method '_mixed_batch_forward' is defined in the class 'LoraLayer' and is responsible for performing mixed batch forward propagation. - - Args: - self (LoraLayer): The instance of the LoraLayer class. - x (mindspore.Tensor): The input tensor for the forward propagation. - - Returns: - mindspore.Tensor: The output tensor after the forward propagation. - - Raises: - - KeyError: If the specified active_adapter is not found in the self.lora_A keys. - - TypeError: If the input parameters are not of the expected types. - - IndexError: If there is an index error while accessing the sub_batch_indices_list. - - """ - # This is a special method that handles the case when users pass the argument `adapter_names`. This is an - # extra argument that allows mixing different adapters in the same batch at inference time. - result = self.base_layer(x, *args, **kwargs) - torch_result_dtype = result.dtype - - unique_adapters = set(adapter_names) - sub_batch_indices_list = [] - for adapter in unique_adapters: - sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter]) - - for i, active_adapter in enumerate(unique_adapters): - if active_adapter == "__base__": - continue - if active_adapter not in self.lora_A.keys(): - continue - - lora_A = self.lora_A[active_adapter] - lora_B = self.lora_B[active_adapter] - dropout = self.lora_dropout[active_adapter] - scaling = self.scaling[active_adapter] - - # getting the sub-batch, passing it to LoRA layers and updating the corresponding indices of the linear - # layer output - sub_batch = x[sub_batch_indices_list[i]].to(lora_A.weight.dtype) - lora_output = lora_B(lora_A(dropout(sub_batch))) * scaling - result[sub_batch_indices_list[i]] += lora_output.to(torch_result_dtype) - - return result - - -# Below code is based on https://github.com/microsoft/LoRA/blob/main/loralib/layers.py -# and modified to work with PyTorch FSDP - - -# ------------------------------------------------------------------------------------------ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. -# ------------------------------------------------------------------------------------------ - - -class Linear(nn.Module, LoraLayer): - - r""" - The Linear class represents a customizable linear layer with support for LoRA (Learned Optimizer Rate Annealing) adapters. This class inherits from the nn.Module and LoraLayer classes. - - The class includes methods for initializing the layer, merging and unmerging adapter weights, computing delta weights for adapters, forwarding the layer's forward pass, and generating a string -representation of the class. - - The __init__ method initializes the Linear layer with specified parameters and configures the LoRA adapters. The merge method combines the active adapter weights into the base weights, with an option to -perform a safe merge operation. 
The unmerge method reverses the merge operation by unmerging all merged adapter layers from the base weights. The get_delta_weight method computes the delta weight for a given -adapter. The forward method applies the forwarded linear layer to input data, with support for adapter-specific adjustments. The __repr__ method returns a string representation of the Linear class prefixed -with 'lora.'. - """ - # Lora implemented in a dense layer - def __init__( - self, - base_layer, - adapter_name: str, - r: int = 0, - lora_alpha: int = 1, - lora_dropout: float = 0.0, - fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) - is_target_conv_1d_layer: bool = False, - init_lora_weights: Union[bool, str] = True, - use_rslora: bool = False, - use_dora: bool = False, - **kwargs, - ) -> None: - r""" - Initializes a Linear object. - - Args: - self: The instance of the Linear class. - base_layer: The base layer to be used for the Linear object. - adapter_name (str): The name of the adapter. - r (int): The value of r. - lora_alpha (int): The alpha value for lora. - lora_dropout (float): The dropout value for lora. - fan_in_fan_out (bool): Flag indicating if fan in fan out is enabled. - is_target_conv_1d_layer (bool): Flag indicating if the layer is the target conv 1D layer. - init_lora_weights (Union[bool, str]): Flag or string indicating if lora weights should be initialized. - use_rslora (bool): Flag indicating if RSLora should be used. - use_dora (bool): Flag indicating if Dora should be used. - **kwargs: Additional keyword arguments. - - Returns: - None. This method does not return any value. - - Raises: - None. - """ - super().__init__() - LoraLayer.__init__(self, base_layer, **kwargs) - self.fan_in_fan_out = fan_in_fan_out - - self._active_adapter = adapter_name - self.update_layer( - adapter_name, - r, - lora_alpha=lora_alpha, - lora_dropout=lora_dropout, - init_lora_weights=init_lora_weights, - use_rslora=use_rslora, - use_dora=use_dora, - ) - self.is_target_conv_1d_layer = is_target_conv_1d_layer - - def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: - """ - Merge the active adapter weights into the base weights - - Args: - safe_merge (`bool`, *optional*): - If True, the merge operation will be performed in a copy of the original weights and check for NaNs - before merging the weights. This is useful if you want to check if the merge operation will produce - NaNs. Defaults to `False`. - adapter_names (`list[str]`, *optional*): - The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults - to `None`. - """ - adapter_names = check_adapters_to_merge(self, adapter_names) - if not adapter_names: - # no adapter to merge - return - - for active_adapter in adapter_names: - if active_adapter in self.lora_A.keys(): - base_layer = self.get_base_layer() - if safe_merge: - # Note that safe_merge will be slower than the normal merge - # because of the copy operation. - orig_weights = base_layer.weight.data.clone() - delta_weight = self.get_delta_weight(active_adapter) - if not self.use_dora[active_adapter]: - orig_weights = orig_weights + delta_weight - else: - # handle dora - # since delta_weight already includes scaling, set it to 1 here - weight_norm = self._get_weight_norm( - orig_weights, transpose(delta_weight, self.fan_in_fan_out), scaling=1 - ) - # We need to cache weight_norm because it has to be based on the original weights. 
We - # cannot calculate it on the fly based on the merged weights when unmerging because its a - # different value - self._cache_store(f"{active_adapter}-weight_norm", weight_norm) - dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm - dora_factor = transpose(dora_factor.view(-1, 1), self.fan_in_fan_out) - orig_weights = dora_factor * (orig_weights + delta_weight) - - if not ops.isfinite(orig_weights).all(): - raise ValueError( - f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" - ) - - base_layer.weight.data = orig_weights - else: - delta_weight = self.get_delta_weight(active_adapter) - if not self.use_dora[active_adapter]: - base_layer.weight.data = base_layer.weight.data + delta_weight - else: - # handle dora - # since delta_weight already includes scaling, set it to 1 here - weight_norm = self._get_weight_norm( - base_layer.weight, transpose(delta_weight, self.fan_in_fan_out), scaling=1 - ) - # We need to cache weight_norm because it has to be based on the original weights. We - # cannot calculate it on the fly based on the merged weights when unmerging because its a - # different value - self._cache_store(f"{active_adapter}-weight_norm", weight_norm) - dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm - dora_factor = transpose(dora_factor.view(-1, 1), self.fan_in_fan_out) - new_weight = dora_factor * (base_layer.weight.data + delta_weight) - base_layer.weight.data = new_weight - - self.merged_adapters.append(active_adapter) - - def unmerge(self) -> None: - """ - This method unmerges all merged adapter layers from the base weights. - """ - if not self.merged: - warnings.warn("Already unmerged. Nothing to do.") - return - while len(self.merged_adapters) > 0: - active_adapter = self.merged_adapters.pop() - if active_adapter in self.lora_A.keys(): - weight = self.get_base_layer().weight - delta_weight = self.get_delta_weight(active_adapter) - if not self.use_dora[active_adapter]: - weight.data -= delta_weight - else: - weight_norm = self._cache_pop(f"{active_adapter}-weight_norm") - dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm - weight_orig = weight.data / dora_factor.view(-1, 1) - delta_weight - weight.data = weight_orig - - def get_delta_weight(self, adapter) -> mindspore.Tensor: - """ - Compute the delta weight for the given adapter. - - Args: - adapter (str): - The name of the adapter for which the delta weight should be computed. - """ - dtype = self.lora_B[adapter].weight.dtype - weight_A = self.lora_A[adapter].weight - weight_B = self.lora_B[adapter].weight - output_tensor = transpose(weight_B @ weight_A, self.fan_in_fan_out) * self.scaling[adapter] - return output_tensor - def _apply_dora(self, x, lora_A, lora_B, scaling, active_adapter): - """ - For DoRA on Linear layers, calculate the extra output from LoRA with DoRA applied. This should be added on top of the base layer - output. 
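The merge/unmerge arithmetic above reduces to a small amount of linear algebra: plain LoRA folds ΔW = (B @ A) · scaling into the base weight, while DoRA additionally rescales each output row by magnitude / ‖W + ΔW‖ and caches that norm so the merge can be inverted exactly. A numpy sketch of just that math (the `fan_in_fan_out` transpose is left out):

```py
# Sketch of the merge/unmerge arithmetic above (plain LoRA vs. DoRA), in numpy.
import numpy as np

def lora_delta(A, B, scaling):
    # A: (r, in), B: (out, r)  ->  delta W: (out, in), already scaled
    return (B @ A) * scaling

def dora_merge(W, delta, magnitude):
    # row-wise L2 norm of the merged weight, computed on the *original* W + delta
    weight_norm = np.linalg.norm(W + delta, axis=1, keepdims=True)
    dora_factor = magnitude[:, None] / weight_norm
    return dora_factor * (W + delta), weight_norm      # cache weight_norm for unmerge

def dora_unmerge(W_merged, delta, magnitude, weight_norm):
    dora_factor = magnitude[:, None] / weight_norm
    return W_merged / dora_factor - delta              # recovers the original W

rng = np.random.default_rng(0)
W = rng.normal(size=(4, 3)); A = rng.normal(size=(2, 3)); B = rng.normal(size=(4, 2))
m = np.linalg.norm(W, axis=1)                          # stand-in magnitude vector
delta = lora_delta(A, B, scaling=0.5)
W_merged, cached_norm = dora_merge(W, delta, m)
assert np.allclose(dora_unmerge(W_merged, delta, m, cached_norm), W)
```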
- """ - base_layer = self.get_base_layer() # Get the base linear layer - weight = base_layer.weight # Base layer's weight - # Compute LoRA weight - lora_weight = ops.mm(lora_B.weight.flatten(start_dim=1), lora_A.weight.flatten(start_dim=1)) - lora_weight = lora_weight.reshape(weight.shape) - # Magnitude scaling and weight normalization - magnitude = self.lora_magnitude_vector[active_adapter] - weight_norm = self._get_weight_norm(weight, lora_weight, scaling) - mag_norm_scale = magnitude / weight_norm - # Base layer's output - base_output = ops.matmul(x, weight.t()) # Linear transformation (x @ weight.T) - # LoRA's output - lora_output = lora_B(lora_A(x)) - # Result with DoRA applied - result_dora = (mag_norm_scale - 1) * base_output + mag_norm_scale * lora_output * scaling - return result_dora - - - def forward(self, x: mindspore.Tensor, *args: Any, **kwargs: Any) -> mindspore.Tensor: - r""" - Constructs the forward pass of the Linear class. - - Args: - self (Linear): The instance of the Linear class. - x (mindspore.Tensor): The input tensor to be processed by the forward pass. - - Returns: - mindspore.Tensor: The output tensor resulting from the forward pass. - - Raises: - None. - """ - self._check_forward_args(x, *args, **kwargs) - adapter_names = kwargs.pop("adapter_names", None) - if self.disable_adapters: - if self.merged: - self.unmerge() - result = self.base_layer(x, *args, **kwargs) - elif adapter_names is not None: - result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs) - elif self.merged: - result = self.base_layer(x, *args, **kwargs) - else: - result = self.base_layer(x, *args, **kwargs) - torch_result_dtype = result.dtype - for active_adapter in self.active_adapters: - if active_adapter not in self.lora_A.keys(): - continue - lora_A = self.lora_A[active_adapter] - lora_B = self.lora_B[active_adapter] - dropout = self.lora_dropout[active_adapter] - scaling = self.scaling[active_adapter] - x = x.to(lora_A.weight.dtype) - if not self.use_dora[active_adapter]: - result = result + lora_B(lora_A(dropout(x))) * scaling - else: - x = dropout(x) - result = result + self.lora_magnitude_vector[active_adapter]( - x, - lora_A=lora_A, - lora_B=lora_B, - scaling=scaling, - base_layer=self.get_base_layer(), - ) - - result = result.to(torch_result_dtype) - - return result - - def __repr__(self) -> str: - r""" - This method returns a string representation of the Linear class instance. - - Args: - self (Linear): The instance of the Linear class for which the string representation is being generated. - - Returns: - str: A string representation of the Linear class instance prefixed with 'lora.'. - - Raises: - No specific exceptions are raised by this method. - """ - rep = super().__repr__() - return "lora." + rep - - -class Embedding(nn.Module, LoraLayer): - - r""" - The 'Embedding' class represents a customizable adapter layer that can be integrated into neural network architectures. It inherits functionalities from the nn.Module and LoraLayer classes, providing a -flexible mechanism for adapting neural network behavior. - - The class includes methods for initializing the adapter layer, updating its parameters, merging adapter weights into base weights, unmerging adapter layers, computing delta weights, and performing mixed -batch forward passes. It also allows for embedding computations and the forwardion of the adapted network output. 
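`_apply_dora` above returns only the extra term that gets added on top of the base output, relying on the identity base(x) + result_dora = x @ W_dora.T with W_dora = m/‖W+ΔW‖ · (W+ΔW). A quick numpy check of that identity (shapes and names are illustrative):

```py
# Numerical check (numpy) of the identity used in `_apply_dora` above:
# base(x) + result_dora == x @ W_dora.T, where W_dora = m/||W+dW|| * (W + dW).
import numpy as np

rng = np.random.default_rng(1)
x = rng.normal(size=(5, 3))
W = rng.normal(size=(4, 3)); A = rng.normal(size=(2, 3)); B = rng.normal(size=(4, 2))
scaling, m = 0.5, np.abs(rng.normal(size=(4,)))

lora_w = B @ A                                        # unscaled LoRA weight
norm = np.linalg.norm(W + scaling * lora_w, axis=1)   # ||W + dW||, per output channel
mag_scale = (m / norm)[None, :]                       # broadcast over the batch

base_out = x @ W.T
lora_out = x @ A.T @ B.T
result_dora = (mag_scale - 1) * base_out + mag_scale * lora_out * scaling

W_dora = (m / norm)[:, None] * (W + scaling * lora_w)
assert np.allclose(base_out + result_dora, x @ W_dora.T)
```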
- - The 'Embedding' class is designed to enhance neural network performance by introducing adapter layers that can adapt to specific tasks or data characteristics, offering a versatile approach to model -adaptation and specialization. - """ - # LoRA implemented in a Embedding layer - def __init__( - self, - base_layer: nn.Module, - adapter_name: str, - r: int = 0, - lora_alpha: int = 1, - lora_dropout: float = 0.0, - init_lora_weights: Union[bool, str] = True, - use_rslora: bool = False, - use_dora: bool = False, - **kwargs, - ) -> None: - r""" - Initializes an instance of the Embedding class. - - Args: - - self: The instance of the class. - - base_layer (nn.Module): The base layer to be used for initialization. - - adapter_name (str): The name of the adapter. - - r (int): The value of r. - - lora_alpha (int): The value of lora alpha. - - lora_dropout (float): The dropout rate for LORA. - - init_lora_weights (Union[bool, str]): Flag to initialize LORA weights. - - use_rslora (bool): Flag to indicate if RSLORA should be used. - - use_dora (bool): Flag to indicate if DORA should be used. - - Returns: - - None: This method does not return any value. - - Raises: - - ValueError: If use_dora is set to True, as the class does not support DoRA yet. It advises to set use_dora to False. - """ - super().__init__() - LoraLayer.__init__(self, base_layer) - - if use_dora: - raise ValueError(f"{self.__class__.__name__} does not support DoRA yet, please set it to False") - - self._active_adapter = adapter_name - self.update_layer( - adapter_name, - r, - lora_alpha=lora_alpha, - lora_dropout=lora_dropout, - init_lora_weights=init_lora_weights, - use_rslora=use_rslora, - use_dora=use_dora, - ) - - def update_layer(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora): - """ - Updates the layer with the specified parameters for the given adapter. - - Args: - self (Embedding): The instance of the Embedding class. - adapter_name (str): The name of the adapter to update. - r (int): The positive integer value representing the dimensionality of the adapter. - lora_alpha (float): The alpha value for LoRA scaling. - lora_dropout (float): The dropout probability for the LoRA layer. Should be in the range (0.0, 1.0). - init_lora_weights (str or bool): The method for initializing LoRA weights. If 'loftq', initialize using loftq method. If True, reset using the provided method. - use_rslora (bool): True to use RSLoRA scaling, False to use regular LoRA scaling. - use_dora (bool): The flag to indicate whether DORA (Dynamic Operation Routing for Adapters) is used. - - Returns: - None. The method updates the layer in place. - - Raises: - ValueError: If the value of `r` is not a positive integer. 
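As a side note on the `use_rslora` flag documented above: standard LoRA scales the update by lora_alpha / r, while rsLoRA uses lora_alpha / sqrt(r), which keeps the effective update magnitude more stable as the rank grows. A tiny illustration:

```py
# Tiny illustration of the two scaling rules selected by `use_rslora` above.
import math

def lora_scaling(lora_alpha, r, use_rslora=False):
    return lora_alpha / math.sqrt(r) if use_rslora else lora_alpha / r

print(lora_scaling(16, 64))          # 0.25  (standard LoRA: alpha / r)
print(lora_scaling(16, 64, True))    # 2.0   (rsLoRA: alpha / sqrt(r))
```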
- """ - if r <= 0: - raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") - - self.r[adapter_name] = r - self.lora_alpha[adapter_name] = lora_alpha - if lora_dropout > 0.0: - lora_dropout_layer = nn.Dropout(p=lora_dropout) - else: - lora_dropout_layer = nn.Identity() - - self.lora_dropout[adapter_name] = lora_dropout_layer - # Actual trainable parameters - weight_A = ops.randn((r, self.in_features)) - weight_B = ops.randn((self.out_features, r)) - self.lora_embedding_A[adapter_name] = Parameter(weight_A) - self.lora_embedding_B[adapter_name] = Parameter(weight_B) - if use_rslora: - self.scaling[adapter_name] = lora_alpha / math.sqrt(r) - else: - self.scaling[adapter_name] = lora_alpha / r - - if init_lora_weights == "loftq": - self.loftq_init(adapter_name) - elif init_lora_weights: - self.reset_lora_parameters(adapter_name, init_lora_weights) - - base_layer = self.get_base_layer() - weight = getattr(base_layer, "weight", None) - if weight is not None: - # the layer is already completely initialized, this is an update - self.to(dtype=weight.dtype) - - self.set_adapter(self.active_adapters) - - def dora_init(self, adapter_name: str) -> None: - if self.lora_magnitude_vector is None: - # first dora layer being added, add lora_magnitude_vector to the list of learnable parameters - self.adapter_layer_names = self.adapter_layer_names[:] + ("lora_magnitude_vector",) - - dora_layer = DoraConv2dLayer(fan_in_fan_out=False) - lora_A = self.lora_A[adapter_name].weight - lora_B = self.lora_B[adapter_name].weight - scaling = self.scaling[adapter_name] - dora_layer.update_layer(base_layer=self.get_base_layer(), lora_A=lora_A, lora_B=lora_B, scaling=scaling) - self.lora_magnitude_vector[adapter_name] = dora_layer - - def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: - """ - Merge the active adapter weights into the base weights - - Args: - safe_merge (`bool`, *optional*): - If True, the merge operation will be performed in a copy of the original weights and check for NaNs - before merging the weights. This is useful if you want to check if the merge operation will produce - NaNs. Defaults to `False`. - adapter_names (`list[str]`, *optional*): - The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults - to `None`. - """ - adapter_names = check_adapters_to_merge(self, adapter_names) - if not adapter_names: - # no adapter to merge - return - - for active_adapter in adapter_names: - if active_adapter in self.lora_embedding_A.keys(): - base_layer = self.get_base_layer() - if safe_merge: - # Note that safe_merge will be slower than the normal merge - # because of the copy operation. - orig_weights = base_layer.weight.data.clone() - orig_weights = orig_weights + self.get_delta_weight(active_adapter) - - if not ops.isfinite(orig_weights).all(): - raise ValueError( - f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" - ) - - base_layer.weight.data = orig_weights - else: - base_layer.weight.data = base_layer.weight.data + self.get_delta_weight(active_adapter) - self.merged_adapters.append(active_adapter) - - def unmerge(self) -> None: - """ - This method unmerges all merged adapter layers from the base weights. - """ - if not self.merged: - warnings.warn("Already unmerged. 
Nothing to do.") - return - while len(self.merged_adapters) > 0: - active_adapter = self.merged_adapters.pop() - if active_adapter in self.lora_embedding_A.keys(): - self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) - - def get_delta_weight(self, adapter) -> mindspore.Tensor: - """ - Compute the delta weight for the given adapter. - - Args: - adapter (str): - The name of the adapter for which the delta weight should be computed. - """ - weight_A = self.lora_embedding_A[adapter] - weight_B = self.lora_embedding_B[adapter] - - output_tensor = transpose(weight_B @ weight_A, True) * self.scaling[adapter] - - return output_tensor - - def _mixed_batch_forward( - self, x: mindspore.Tensor, *args: Any, adapter_names: list[str], **kwargs: Any - ) -> mindspore.Tensor: - r""" - This method '_mixed_batch_forward' is defined in the class 'Embedding' and is used to perform a mixed batch forward operation. - - Args: - self: The instance of the 'Embedding' class. - x (mindspore.Tensor): The input tensor on which the mixed batch forward operation is performed. - - *args: Variable length argument list. - - adapter_names (list[str]): A list of adapter names which are used to identify unique adapters. - - **kwargs: Variable keyword argument list. - - Returns: - mindspore.Tensor: Returns the result of the mixed batch forward operation as a tensor of type 'mindspore.Tensor'. - - Raises: - None - """ - # This is a special method that handles the case when users pass the argument `adapter_names`. This is an - # extra argument that allows mixing different adapters in the same batch at inference time. - result = self.base_layer(x, *args, **kwargs) - - unique_adapters = set(adapter_names) - sub_batch_indices_list = [] - for adapter in unique_adapters: - sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter]) - - for i, active_adapter in enumerate(unique_adapters): - if active_adapter == "__base__": - continue - if active_adapter not in self.lora_embedding_A.keys(): - continue - - embedding_A = self.lora_embedding_A[active_adapter].T - embedding_B = self.lora_embedding_B[active_adapter].T - scaling = self.scaling[active_adapter] - - # getting the sub-batch, passing it to LoRA layers and updating the corresponding indices of the linear - # layer output - sub_batch = x[sub_batch_indices_list[i]] - after_A = self._embed(sub_batch, embedding_A) - result[sub_batch_indices_list[i]] += (after_A @ embedding_B) * scaling - - return result - - def _embed(self, input: mindspore.Tensor, weight: mindspore.Tensor) -> mindspore.Tensor: - r""" - Method _embed in the class Embedding. - - This method is responsible for performing embedding using the input and weight tensors. - - Args: - self (Embedding): The instance of the Embedding class. - input (mindspore.Tensor): The input tensor containing the indices for embedding lookup. - weight (mindspore.Tensor): The weight tensor containing the embedding vectors. - - Returns: - mindspore.Tensor: A tensor resulting from the embedding lookup operation. - - Raises: - None - """ - base_layer = self.get_base_layer() - return F.embedding( - input, - weight, - padding_idx=base_layer.padding_idx, - max_norm=base_layer.max_norm, - norm_type=base_layer.norm_type, - scale_grad_by_freq=base_layer.scale_grad_by_freq, - # sparse=base_layer.sparse, - ) - - def forward(self, x: mindspore.Tensor, *args: Any, **kwargs: Any) -> mindspore.Tensor: - r""" - Constructs the embedding layer. 
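The Embedding variant above stores its trainable factors as `lora_embedding_A` (r × num_embeddings) and `lora_embedding_B` (embedding_dim × r): the forward pass looks the token ids up in Aᵀ and projects the result with Bᵀ. A numpy sketch of that path (dropout and padding handling omitted, names illustrative):

```py
# Sketch (numpy) of the Embedding LoRA path above: look indices up in A.T,
# then project with B.T and scale.
import numpy as np

def lora_embedding_forward(ids, base_weight, A, B, scaling):
    # base_weight: (num_embeddings, dim), A: (r, num_embeddings), B: (dim, r)
    base = base_weight[ids]            # ordinary embedding lookup
    after_A = A.T[ids]                 # lookup into the (num_embeddings, r) table
    return base + (after_A @ B.T) * scaling

rng = np.random.default_rng(2)
vocab, dim, r = 10, 6, 2
W = rng.normal(size=(vocab, dim))
A = rng.normal(size=(r, vocab)); B = rng.normal(size=(dim, r))
ids = np.array([1, 3, 3, 7])
out = lora_embedding_forward(ids, W, A, B, scaling=8 / r)   # scaling = lora_alpha / r
assert out.shape == (4, dim)
```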
- - Args: - self (Embedding): The instance of the Embedding class. - x (mindspore.Tensor): The input tensor to be embedded. - - Returns: - mindspore.Tensor: The embedded tensor. - - Raises: - TypeError: If the input arguments are not of the correct type. - ValueError: If any of the input arguments are invalid or out of range. - RuntimeError: If an error occurs while embedding the tensor. - """ - # TODO: no dtype conversion here, unlike in Linear, is that correct? - self._check_forward_args(x, *args, **kwargs) - adapter_names = kwargs.pop("adapter_names", None) - - if self.disable_adapters: - if self.merged: - self.unmerge() - result = self.base_layer(x, *args, **kwargs) - elif adapter_names is not None: - result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs) - elif self.merged: - result = self.base_layer(x, *args, **kwargs) - else: - result = self.base_layer(x, *args, **kwargs) - torch_result_dtype = result.dtype - for active_adapter in self.active_adapters: - if active_adapter not in self.lora_embedding_A: - continue - embedding_A = self.lora_embedding_A[active_adapter].T - embedding_B = self.lora_embedding_B[active_adapter].T - scaling = self.scaling[active_adapter] - after_A = self._embed(x, embedding_A) - result = result + (after_A @ embedding_B) * scaling - result = result.to(torch_result_dtype) - - return result - - def __repr__(self) -> str: - r""" - This method '__repr__' in the class 'Embedding' generates a string representation of the object. - - Args: - self: An instance of the Embedding class. - Purpose: Represents the current instance of the Embedding class. - Restrictions: None. - - Returns: - str: A string representation of the object. - Purpose: Provides a textual representation of the object for debugging and logging purposes. - - Raises: - None. - """ - rep = super().__repr__() - return "lora." + rep - - -class Conv2d(nn.Module, LoraLayer): - - r""" - Represents a custom Conv2d class that incorporates LoRA (Locally Recurrent Adaptive) functionality for adaptive learning in neural networks. This class inherits from the nn.Module and LoraLayer classes. - - Attributes: - - base_layer (nn.Module): The base layer for the Conv2d operation. - - adapter_name (str): The name of the adapter associated with the Conv2d operation. - - r (int): The parameter 'r' representing the number of features in the Conv2d operation. - - lora_alpha (int): The alpha value used in LoRA operations. - - lora_dropout (float): The dropout rate for LoRA operations. - - init_lora_weights (Union[bool, str]): Indicates whether to initialize LoRA weights or use a specific initialization method. - - use_rslora (bool): Flag indicating whether to use RSLora (Root-Sparse LoRA) functionality. - - use_dora (bool): Flag indicating whether to use DoRA (Densely Recurrent Adaptive) functionality. - - Methods: - - __init__: Initializes the Conv2d class with specified parameters and initializes LoRA operations. - - update_layer: Updates the specified adapter with the provided parameters for LoRA operations. - - merge: Merges the active adapter weights into the base weights, optionally performing a safe merge operation. - - unmerge: Unmerges all previously merged adapter layers from the base weights. - - get_delta_weight: Computes the delta weight for a given adapter based on LoRA weights. - - _get_weight_norm: Computes the norm of the weights based on scaling factors. - - _apply_dora: Calculates the output with DoRA applied for LoRA operations. 
- - forward: Constructs the Conv2d operation, incorporating LoRA functionality based on active adapters. - - __repr__: Returns a string representation of the Conv2d class prefixed with 'lora.'. - - Note: The Conv2d class extends the functionality of the underlying nn.Module and LoraLayer classes by incorporating adaptive learning mechanisms using LoRA operations. - """ - # Lora implemented in a conv2d layer - def __init__( - self, - base_layer: nn.Module, - adapter_name: str, - r: int = 0, - lora_alpha: int = 1, - lora_dropout: float = 0.0, - init_lora_weights: Union[bool, str] = True, - use_rslora: bool = False, - use_dora: bool = False, - **kwargs, - ) -> None: - r""" - Initializes an instance of the Conv2d class. - - Args: - self: The instance of the Conv2d class. - base_layer (nn.Module): The base layer to be adapted. - adapter_name (str): The name of the adapter. - r (int, optional): The value of r. Defaults to 0. - lora_alpha (int, optional): The value of lora_alpha. Defaults to 1. - lora_dropout (float, optional): The value of lora_dropout. Defaults to 0.0. - init_lora_weights (Union[bool, str], optional): The value to initialize Lora weights. Defaults to True. - use_rslora (bool, optional): Flag to indicate whether to use RSLora. Defaults to False. - use_dora (bool, optional): Flag to indicate whether to use Dora. Defaults to False. - **kwargs: Additional keyword arguments. - - Returns: - None - - Raises: - None - """ - super().__init__() - LoraLayer.__init__(self, base_layer) - - self._active_adapter = adapter_name - self.update_layer( - adapter_name, - r, - lora_alpha=lora_alpha, - lora_dropout=lora_dropout, - init_lora_weights=init_lora_weights, - use_rslora=use_rslora, - use_dora=use_dora, - ) - - def update_layer(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora): - r""" - Update the layer for the Conv2d class with the provided parameters. - - Args: - - self: The instance of the Conv2d class. - - adapter_name (str): The name of the adapter. - - r (int): The positive integer value representing the number of features for the adapter. - - lora_alpha (float): The alpha value for the LORA mechanism. - - lora_dropout (float): The dropout probability for the LORA mechanism. Should be in the range (0.0, 1.0]. - - init_lora_weights (str or bool): The method to initialize LORA weights. Can be 'loftq' or a boolean value. - - use_rslora (bool): Flag indicating whether to use RS-LORA scaling. - - use_dora (bool): Flag indicating whether to use DORA for the adapter. - - Returns: - None. This method updates the Conv2d layer with the specified parameters. - - Raises: - - ValueError: If the value of `r` is less than or equal to 0. 
- """ - if r <= 0: - raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") - - self.r[adapter_name] = r - self.lora_alpha[adapter_name] = lora_alpha - if lora_dropout > 0.0: - lora_dropout_layer = nn.Dropout(p=lora_dropout) - else: - lora_dropout_layer = nn.Identity() - - self.lora_dropout[adapter_name] = lora_dropout_layer - # Actual trainable parameters - base_layer = self.get_base_layer() - kernel_size = base_layer.kernel_size - stride = base_layer.stride - padding = base_layer.padding - self.lora_A[adapter_name] = nn.Conv2d(self.in_features, r, kernel_size, stride, padding, bias=False) - self.lora_B[adapter_name] = nn.Conv2d(r, self.out_features, (1, 1), (1, 1), bias=False) - if use_rslora: - self.scaling[adapter_name] = lora_alpha / math.sqrt(r) - else: - self.scaling[adapter_name] = lora_alpha / r - - if init_lora_weights == "loftq": - self.loftq_init(adapter_name) - elif init_lora_weights: - self.reset_lora_parameters(adapter_name, init_lora_weights) - - weight = getattr(base_layer, "weight", None) - if weight is not None: - # the layer is already completely initialized, this is an update - self.to(dtype=weight.dtype) - - if use_dora: - # self.dora_init(adapter_name) - self.use_dora[adapter_name] = True - else: - self.use_dora[adapter_name] = False - - self.set_adapter(self.active_adapters) - - def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: - """ - Merge the active adapter weights inside the base weights - - - Args: - safe_merge (`bool`, *optional*): - If True, the merge operation will be performed in a copy of the original weights and check for NaNs - before merging the weights. This is useful if you want to check if the merge operation will produce - NaNs. Defaults to `False`. - adapter_names (`list[str]`, *optional*): - The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults - to `None`. - """ - adapter_names = check_adapters_to_merge(self, adapter_names) - if not adapter_names: - # no adapter to merge - return - - for active_adapter in adapter_names: - if active_adapter in self.lora_A.keys(): - base_layer = self.get_base_layer() - if safe_merge: - # Note that safe_merge will be slower than the normal merge - # because of the copy operation. - orig_weights = base_layer.weight.data.clone() - delta_weight = self.get_delta_weight(active_adapter) - - if not self.use_dora[active_adapter]: - orig_weights = orig_weights + delta_weight - else: - # handle dora - # since delta_weight already includes scaling, set it to 1 here - weight_norm = self._get_weight_norm(orig_weights, delta_weight, scaling=1) - # We need to cache weight_norm because it has to be based on the original weights. We - # cannot calculate it on the fly based on the merged weights when unmerging because its a - # different value - self._cache_store(f"{active_adapter}-weight_norm", weight_norm) - dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm - orig_weights = dora_factor.view(-1, 1, 1, 1) * (orig_weights + delta_weight) - - if not ops.isfinite(orig_weights).all(): - raise ValueError( - f"NaNs detected in the merged weights. 
The adapter {active_adapter} seems to be broken" - ) - base_layer.weight.data = orig_weights - else: - delta_weight = self.get_delta_weight(active_adapter) - if not self.use_dora[active_adapter]: - base_layer.weight.data = base_layer.weight.data + delta_weight - else: - # handle dora - # since delta_weight already includes scaling, set it to 1 here - weight_norm = self._get_weight_norm(base_layer.weight, delta_weight, scaling=1) - # We need to cache weight_norm because it has to be based on the original weights. We - # cannot calculate it on the fly based on the merged weights when unmerging because its a - # different value - self._cache_store(f"{active_adapter}-weight_norm", weight_norm) - dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm - new_weight = dora_factor.view(-1, 1, 1, 1) * (base_layer.weight.data + delta_weight) - base_layer.weight.data = new_weight - - self.merged_adapters.append(active_adapter) - - def unmerge(self) -> None: - """ - This method unmerges all merged adapter layers from the base weights. - """ - if not self.merged: - warnings.warn("Already unmerged. Nothing to do.") - return - while len(self.merged_adapters) > 0: - active_adapter = self.merged_adapters.pop() - if active_adapter in self.lora_A.keys(): - weight = self.get_base_layer().weight - delta_weight = self.get_delta_weight(active_adapter) - if not self.use_dora[active_adapter]: - weight.data -= delta_weight - else: - weight_norm = self._cache_pop(f"{active_adapter}-weight_norm") - dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm - weight_orig = weight.data / dora_factor.view(-1, 1, 1, 1) - delta_weight - weight.data = weight_orig - - def get_delta_weight(self, adapter) -> mindspore.Tensor: - """ - Compute the delta weight for the given adapter. - - Args: - adapter (str): - The name of the adapter for which the delta weight should be computed. - """ - dtype = self.lora_A[adapter].weight.dtype - - # In case users wants to merge the adapter weights that are in - # float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to - # float16 because the `@` and matmul operation in general is not supported in torch + cpu + fp16. - weight_A = self.lora_A[adapter].weight - weight_B = self.lora_B[adapter].weight - - # https://github.com/bmaltais/kohya_ss/blob/feb6728762a8f463d15ba936d189d4c3abfaa1ab/networks/lora.py#L117 - if self.get_base_layer().weight.shape[2:4] == (1, 1): - # conv2d 1x1 - output_tensor = (weight_B.squeeze(3).squeeze(2) @ weight_A.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze( - 3 - ) * self.scaling[adapter] - else: - # conv2d 3x3 - output_tensor = ( - ops.conv2d( - weight_A.permute(1, 0, 2, 3), - weight_B, - ).permute(1, 0, 2, 3) - * self.scaling[adapter] - ) - - return output_tensor - - def _get_weight_norm(self, weight, lora_weight, scaling) -> mindspore.Tensor: - r""" - Calculates and returns the normalized weight tensor for the Conv2d layer. - - Args: - self (Conv2d): The instance of the Conv2d class. - weight (mindspore.Tensor): The weight tensor of the Conv2d layer. - lora_weight (mindspore.Tensor): The additional weight tensor for LORA (Low-Rank Approximation). - scaling (float): The scaling factor to adjust the impact of lora_weight. - - Returns: - mindspore.Tensor: The normalized weight tensor after applying L2 normalization. - - Raises: - None. - - This method takes the weight tensor of the Conv2d layer, the additional lora_weight tensor, and a scaling factor as input. 
It calculates the normalized weight tensor by adding the scaled lora_weight -tensor to the weight tensor. Then, it applies L2 normalization to the resulting tensor along dimensions (1, 2, 3) and returns the normalized weight tensor. The purpose of this method is to compute the weight -normalization required for the Conv2d layer's computations. - """ - # calculate L2 norm of weight matrix, channel-wise - weight = weight + scaling * lora_weight - # the following is needed to have compatibility with the 4D weight tensors of Conv2D - weight_norm = weight.norm(p=2, dim=(1, 2, 3), keepdim=True).swapaxes(1, 0) - return weight_norm - - def _apply_dora(self, x, lora_A, lora_B, scaling, active_adapter): - """ - For DoRA, calculate the extra output from LoRA with DoRA applied. This should be added on top of the base layer - output. - """ - base_layer = self.get_base_layer() - weight = base_layer.weight - lora_weight = ops.mm(lora_B.weight.flatten(start_dim=1), lora_A.weight.flatten(start_dim=1)) - lora_weight = lora_weight.reshape(weight.shape) - magnitude = self.lora_magnitude_vector[active_adapter] - weight_norm = self._get_weight_norm(weight, lora_weight, scaling) - # see section 4.3 of DoRA (https://arxiv.org/abs/2402.09353) - # "[...] we suggest treating ||V +∆V ||_c in - # Eq. (5) as a constant, thereby detaching it from the gradient - # graph. This means that while ||V + ∆V ||_c dynamically - # reflects the updates of ∆V , it won’t receive any gradient - # during backpropagation" - mag_norm_scale = magnitude / weight_norm - result_dora = (mag_norm_scale - 1) * ( - ops.conv2d( - x, - weight, - bias=None, - stride=base_layer.stride, - padding=base_layer.padding, - dilation=base_layer.dilation, - groups=base_layer.groups, - ) - ) + mag_norm_scale * lora_B(lora_A(x)) * scaling - - return result_dora - - def forward(self, x: mindspore.Tensor, *args, **kwargs) -> mindspore.Tensor: - r""" - Constructs a forward pass of the Conv2d layer. - - Args: - self (Conv2d): An instance of the Conv2d class. - x (mindspore.Tensor): The input tensor to the Conv2d layer. - It should have a shape of (batch_size, channels, height, width). - - Returns: - mindspore.Tensor: The output tensor after passing through the Conv2d layer. - It has the same shape as the input tensor. - - Raises: - ValueError: If the input tensor is not provided. - TypeError: If the input tensor is not of type mindspore.Tensor. 
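Two Conv2d helpers above are easy to misread in diff form: `get_delta_weight` collapses B and A into a single conv kernel (a plain matmul when the kernel is 1×1), and `_get_weight_norm` takes an L2 norm over the (in, kH, kW) axes of each output channel. A numpy sketch of the 1×1 case and the channel-wise norm (illustrative shapes):

```py
# Sketch (numpy) of the Conv2d helpers above: delta weight for the 1x1-kernel
# case and the channel-wise L2 norm used for DoRA scaling.
import numpy as np

def conv1x1_delta(A, B, scaling):
    # A: (r, in, 1, 1), B: (out, r, 1, 1)  ->  (out, in, 1, 1)
    flat = B.squeeze(-1).squeeze(-1) @ A.squeeze(-1).squeeze(-1)
    return flat[:, :, None, None] * scaling

def channelwise_norm(weight, lora_weight, scaling):
    # L2 norm over (in, kH, kW) for each output channel, as in `_get_weight_norm`
    w = weight + scaling * lora_weight
    return np.sqrt((w ** 2).sum(axis=(1, 2, 3)))

rng = np.random.default_rng(3)
out_c, in_c, r = 8, 4, 2
W = rng.normal(size=(out_c, in_c, 1, 1))
A = rng.normal(size=(r, in_c, 1, 1)); B = rng.normal(size=(out_c, r, 1, 1))
delta = conv1x1_delta(A, B, scaling=0.5)
norms = channelwise_norm(W, delta, scaling=1)   # scaling=1: delta is already scaled
assert delta.shape == W.shape and norms.shape == (out_c,)
```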
- """ - self._check_forward_args(x, *args, **kwargs) - adapter_names = kwargs.pop("adapter_names", None) - - if self.disable_adapters: - if self.merged: - self.unmerge() - result = self.base_layer(x, *args, **kwargs) - elif adapter_names is not None: - result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs) - elif self.merged: - result = self.base_layer(x, *args, **kwargs) - else: - result = self.base_layer(x, *args, **kwargs) - torch_result_dtype = result.dtype - - for active_adapter in self.active_adapters: - if active_adapter not in self.lora_A.keys(): - continue - lora_A = self.lora_A[active_adapter] - lora_B = self.lora_B[active_adapter] - dropout = self.lora_dropout[active_adapter] - scaling = self.scaling[active_adapter] - x = x.to(lora_A.weight.dtype) - - if not self.use_dora[active_adapter]: - result = result + lora_B(lora_A(dropout(x))) * scaling - else: - x = dropout(x) - result = result + self._apply_dora(x, lora_A, lora_B, scaling, active_adapter) - - result = result.to(torch_result_dtype) - return result - - def __repr__(self) -> str: - r""" - Method '__repr__' in the class 'Conv2d'. - - Args: - self: Conv2d - The instance of the Conv2d class. - Represents the current object instance. - - Returns: - str - A string representation of the object. - Returns a string prefixed with 'lora.', which is a concatenation of the superclass's string representation. - - Raises: - No specific exceptions are raised by this method. - """ - rep = super().__repr__() - return "lora." + rep - - -def dispatch_default( - target: nn.Module, - adapter_name: str, - lora_config: LoraConfig, - **kwargs, -) -> Optional[nn.Module]: - r""" - Dispatches the default adapter for different types of neural network layers. - - Args: - target (nn.Module): The target neural network layer for which the adapter is being dispatched. - adapter_name (str): The name of the adapter being used. - lora_config (LoraConfig): Configuration object containing LoftQ configuration settings. - - Returns: - Optional[nn.Module]: The new cell representing the adapted version of the target neural network layer, or None if no adapter is dispatched. - - Raises: - - KeyError: If required keys are not found in the input kwargs. - - TypeError: If the input target is not a valid neural network cell. - - Warning: If conflicting settings are detected for fan_in_fan_out parameter. - """ - new_cell = None - - if isinstance(target, BaseTunerLayer): - target_base_layer = target.get_base_layer() - else: - target_base_layer = target - - if isinstance(target_base_layer, nn.Embedding): - embedding_kwargs = kwargs.copy() - embedding_kwargs.pop("fan_in_fan_out", None) - embedding_kwargs.update(lora_config.loftq_config) - new_cell = Embedding(target, adapter_name, **embedding_kwargs) - elif isinstance(target_base_layer, nn.Conv2d): - kwargs.update(lora_config.loftq_config) - new_cell = Conv2d(target, adapter_name, **kwargs) - elif isinstance(target_base_layer, nn.Linear): - if kwargs["fan_in_fan_out"]: - warnings.warn( - "fan_in_fan_out is set to True but the target cell is `torch.nn.Linear`. " - "Setting fan_in_fan_out to False." - ) - kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False - kwargs.update(lora_config.loftq_config) - new_cell = Linear(target, adapter_name, **kwargs) - elif isinstance(target_base_layer, Conv1D): - if not kwargs["fan_in_fan_out"]: - warnings.warn( - "fan_in_fan_out is set to False but the target cell is `Conv1D`. Setting fan_in_fan_out to True." 
- ) - kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = True - kwargs.update(lora_config.loftq_config) - new_cell = Linear(target, adapter_name, is_target_conv_1d_layer=True, **kwargs) - - return new_cell diff --git a/mindnlp/peft/tuners/lora/model.py b/mindnlp/peft/tuners/lora/model.py deleted file mode 100644 index 8d2a72b07..000000000 --- a/mindnlp/peft/tuners/lora/model.py +++ /dev/null @@ -1,1066 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""lora model""" -from __future__ import annotations - -import math -import operator -import re -import warnings -from contextlib import contextmanager -from dataclasses import asdict, replace -from enum import Enum -from functools import partial, reduce -from itertools import chain -from typing import Optional -try: - from typing import Literal -except: - from typing_extensions import Literal - -from tqdm import tqdm -import mindspore - -from mindnlp.core import nn, ops -from ..tuners_utils import ( - BaseTuner, - BaseTunerLayer, - check_target_module_exists, - # onload_layer, - replicate_layers, -) -from ...utils import ( - TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, - ModulesToSaveWrapper, - _freeze_adapter, - _get_submodules, -) -from ...utils.merge_utils import dare_linear, dare_ties, magnitude_prune, task_arithmetic, ties - -from .config import LoraConfig -from .layer import Conv2d, LoraLayer, dispatch_default - - -def _adapter_names_pre_forward_hook(target, args, kwargs, adapter_names): - """ - Args: - target (object): The target object to which the hook is applied. - args (tuple): The positional arguments passed to the function. - kwargs (dict): The keyword arguments passed to the function. - adapter_names (list): The list of adapter names. - - Returns: - None: This function does not return any value. - - Raises: - None - """ - # pre-forward hook to inject the adapter_names argument when using mixed adapter batches inference - kwargs["adapter_names"] = adapter_names - return args, kwargs - - -class LoraModel(BaseTuner): - """ - Creates Low Rank Adapter (LoRA) model from a pretrained transformers model. - - The method is described in detail in https://arxiv.org/abs/2106.09685. - - Args: - model ([`nn.Module`]): The model to be adapted. - config ([`LoraConfig`]): The configuration of the Lora model. - adapter_name (`str`): The name of the adapter, defaults to `"default"`. - - Returns: - LoraModel ([`mindspore.nn.Module`]): The Lora model. - - Example: - - ```py - >>> from transformers import AutoModelForSeq2SeqLM - >>> from peft import LoraModel, LoraConfig - - >>> config = LoraConfig( - ... task_type="SEQ_2_SEQ_LM", - ... r=8, - ... lora_alpha=32, - ... target_modules=["q", "v"], - ... lora_dropout=0.01, - ... 
) - - >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") - >>> lora_model = LoraModel(model, config, "default") - ``` - - ```py - >>> import torch - >>> import transformers - >>> from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training - - >>> rank = ... - >>> target_modules = ["q_proj", "k_proj", "v_proj", "out_proj", "fc_in", "fc_out", "wte"] - >>> config = LoraConfig( - ... r=4, lora_alpha=16, target_modules=target_modules, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM" - ... ) - >>> quantization_config = transformers.BitsAndBytesConfig(load_in_8bit=True) - - >>> tokenizer = transformers.AutoTokenizer.from_pretrained( - ... "kakaobrain/kogpt", - ... revision="KoGPT6B-ryan1.5b-float16", # or float32 version: revision=KoGPT6B-ryan1.5b - ... bos_token="[BOS]", - ... eos_token="[EOS]", - ... unk_token="[UNK]", - ... pad_token="[PAD]", - ... mask_token="[MASK]", - ... ) - >>> model = transformers.GPTJForCausalLM.from_pretrained( - ... "kakaobrain/kogpt", - ... revision="KoGPT6B-ryan1.5b-float16", # or float32 version: revision=KoGPT6B-ryan1.5b - ... pad_token_id=tokenizer.eos_token_id, - ... use_cache=False, - ... device_map={"": rank}, - ... ms_dtype=torch.float16, - ... quantization_config=quantization_config, - ... ) - >>> model = prepare_model_for_kbit_training(model) - >>> lora_model = get_peft_model(model, config) - ``` - - > **Attributes**: - - > - **model** ([`transformers.PreTrainedModel`])— The model to be adapted. - - > - **peft_config** ([`LoraConfig`]): The configuration of the Lora model. - """ - prefix: str = "lora_" - - def _check_new_adapter_config(self, config: LoraConfig) -> None: - """ - A helper method to check the config when a new adapter is being added. - - Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters. - - """ - # TODO: there should be a check if any of the existing adapters actually has bias != "none", or else the check - # does not fully correspond to the error message. - if (len(self.peft_config) > 1) and (config.bias != "none"): - raise ValueError( - f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, " - "set bias to 'none' for all adapters." - ) - - @staticmethod - def _check_target_module_exists(lora_config, key): - r""" - Checks if the target cell exists in the LoRa configuration. - - Args: - lora_config (dict): A dictionary containing the LoRa configuration. - This dictionary should have the following structure: - { - "target_modules": { - "cell1": { - ... - }, - "cell2": { - ... - }, - ... - }, - ... - } - The 'target_modules' key should contain the target cell information. - key (str): The key to identify the target cell. - The key should be a string that matches the key used in the 'target_modules' dictionary. - - Returns: - None: This method does not return any value. - - Raises: - None: This method does not raise any exceptions. - """ - return check_target_module_exists(lora_config, key) - - def _prepare_model(self, peft_config: LoraConfig, model: nn.Module): - r""" - A private method to modify the model structure before adapter is applied. - - Args: - peft_config (`PeftConfig`): - The prepared adapter config. - model (`nn.Module`): - The model that is going to be adapted. 
- """ - if peft_config.layer_replication: - replicate_layers(model, peft_config.layer_replication) - - def _create_and_replace( - self, - lora_config, - adapter_name, - target, - target_name, - parent, - current_key, - ): - r""" - Creates a new cell and replaces an existing cell in the LoraModel. - - Args: - self (LoraModel): The instance of the LoraModel class. - lora_config (LoraConfig): The LoraConfig object containing Lora configuration parameters. - adapter_name (str): The name of the adapter. - target (LoraLayer): The target LoraLayer or AdaLoraLayer object to update or replace. - target_name (str): The name of the target layer. - parent (nn.Module): The parent module to which the target layer belongs. - current_key: The current key used for matching patterns. - - Returns: - None. The method modifies the LoraModel by creating and replacing modules. - - Raises: - ValueError: If the current_key is None. - - Note: - This method dynamically determines the appropriate rank (r) and alpha (lora_alpha) values - based on the current_key and the pattern keys defined in the lora_config. It then creates - a new cell with the specified lora configuration parameters and replaces the existing - cell with the new cell in the LoraModel. - - If the target is an instance of LoraLayer (but not AdaLoraLayer), the method updates - the layer with the specified adapter_name, rank (r), lora_alpha, lora_dropout, - init_lora_weights, use_rslora, and use_dora parameters. - - If the target is not an instance of LoraLayer, the method creates a new cell using the - _create_new_cell method with the specified lora configuration parameters. If the adapter_name - is not in the active_adapters list, the requires_grad attribute of the new cell is set to False. - - The method then replaces the existing cell in the parent module with the new cell using - the _replace_cell method. 
- """ - if current_key is None: - raise ValueError("Current Key shouldn't be `None`") - - # Regexp matching - Find key which matches current target_name in patterns provided - pattern_keys = list(chain(lora_config.rank_pattern.keys(), lora_config.alpha_pattern.keys())) - target_name_key = next(filter(lambda key: re.match(rf".*\.{key}$", current_key), pattern_keys), current_key) - r = lora_config.rank_pattern.get(target_name_key, lora_config.r) - alpha = lora_config.alpha_pattern.get(target_name_key, lora_config.lora_alpha) - - kwargs = { - "r": r, - "lora_alpha": alpha, - "lora_dropout": lora_config.lora_dropout, - "fan_in_fan_out": lora_config.fan_in_fan_out, - "init_lora_weights": lora_config.init_lora_weights, - "use_rslora": lora_config.use_rslora, - "use_dora": lora_config.use_dora, - "loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False), - "loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False), - } - - # quant_methods = ["gptq", "aqlm", "awq"] - # for quant_method in quant_methods: - # quantization_config = get_quantization_config(self.model, method=quant_method) - # if quantization_config is not None: - # kwargs[f"{quant_method}_quantization_config"] = quantization_config - - # note: AdaLoraLayer is a subclass of LoraLayer, we need to exclude it - from ..adalora import AdaLoraLayer - - if isinstance(target, LoraLayer) and not isinstance(target, AdaLoraLayer): - target.update_layer( - adapter_name, - r, - lora_alpha=alpha, - lora_dropout=lora_config.lora_dropout, - init_lora_weights=lora_config.init_lora_weights, - use_rslora=lora_config.use_rslora, - use_dora=lora_config.use_dora, - ) - else: - new_cell = self._create_new_cell(lora_config, adapter_name, target, **kwargs) - if adapter_name not in self.active_adapters: - # adding an additional adapter: it is not automatically trainable - new_cell.requires_grad = False - self._replace_cell(parent, target_name, new_cell, target) - - def _replace_cell(self, parent, child_name, new_cell, child): - r""" - This method replaces a cell within the LoraModel by updating the specified child of the parent with a new cell. - - Args: - self (object): The instance of the LoraModel class. - parent (object): The parent object where the cell replacement will occur. - child_name (str): The name of the child attribute within the parent object. - new_cell (object): The new cell object that will replace the existing child within the parent. - child (object): The existing child object that will be replaced by the new_cell. - - Returns: - None. This method does not return any value. - - Raises: - No specific exceptions are raised within this method. - """ - setattr(parent, child_name, new_cell) - # It's not necessary to set requires_grad here, as that is handled by - # _mark_only_adapters_as_trainable - - # child layer wraps the original cell, unpack it - if hasattr(child, "base_layer"): - child = child.base_layer - - if not hasattr(new_cell, "base_layer"): - new_cell.weight = child.weight - if hasattr(child, "bias"): - new_cell.bias = child.bias - - if getattr(child, "state", None) is not None: - if hasattr(new_cell, "base_layer"): - new_cell.base_layer.state = child.state - else: - new_cell.state = child.state - - def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: - r""" - Marks only specific adapters in the model as trainable based on the specified bias configuration. - - Args: - self (LoraModel): The instance of the LoraModel class. 
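The per-module override logic in `_create_and_replace` above resolves `r` and `lora_alpha` by regex-matching the module's full key against the keys of `rank_pattern` and `alpha_pattern`; the first pattern whose key matches the end of the path wins, otherwise the defaults apply. A small sketch with made-up pattern keys:

```py
# Small sketch of the rank/alpha override lookup performed in
# `_create_and_replace` above. The pattern keys and module key are made up.
import re
from itertools import chain

rank_pattern = {"q_proj": 16}                 # hypothetical per-module rank override
alpha_pattern = {"layers.0.q_proj": 64}       # hypothetical per-module alpha override
default_r, default_alpha = 8, 32

current_key = "model.layers.0.q_proj"
pattern_keys = list(chain(rank_pattern.keys(), alpha_pattern.keys()))
# first pattern whose key matches the *end* of the full module path wins
matched = next(
    (key for key in pattern_keys if re.match(rf".*\.{key}$", current_key)),
    current_key,
)
r = rank_pattern.get(matched, default_r)
alpha = alpha_pattern.get(matched, default_alpha)
print(matched, r, alpha)    # -> q_proj 16 32
```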
- model (nn.Module): The neural network model on which to apply the trainable markings. - - Returns: - None. This method does not return any value. - - Raises: - NotImplementedError: If the requested bias configuration is not implemented. - """ - for n, p in model.parameters_and_names(): - if self.prefix not in n: - p.requires_grad = False - - for active_adapter in self.active_adapters: - bias = self.peft_config[active_adapter].bias - if bias == "none": - continue - - if bias == "all": - for n, p in model.parameters_and_names(): - if "bias" in n: - p.requires_grad = True - elif bias == "lora_only": - for m in model.modules(): - if isinstance(m, LoraLayer) and hasattr(m, "bias") and m.bias is not None: - m.bias.requires_grad = True - else: - raise NotImplementedError(f"Requested bias: {bias}, is not implemented.") - - @staticmethod - def _create_new_cell(lora_config, adapter_name, target, **kwargs): - r""" - Method to create a new cell based on the provided parameters. - - Args: - lora_config (dict): The configuration parameters for the Lora model. - adapter_name (str): The name of the adapter to be used. - target (torch.nn.Module): The target cell for which a new cell needs to be created. - - Returns: - None. Returns the newly created cell based on the specified target. - - Raises: - ValueError: If the target cell is not supported. Currently supported modules include `torch.nn.Linear`, `torch.nn.Embedding`, `torch.nn.Conv2d`, and `transformers.pytorch_utils.Conv1D`. - """ - # Collect dispatcher functions to decide what backend to use for the replaced LoRA layer. The order matters, - # because the first match is always used. Therefore, the default layers should be checked last. - dispatchers = [dispatch_default] - - new_cell = None - for dispatcher in dispatchers: - new_cell = dispatcher(target, adapter_name, lora_config=lora_config, **kwargs) - if new_cell is not None: # first match wins - break - - if new_cell is None: - # no cell could be matched - raise ValueError( - f"Target cell {target} is not supported. Currently, only the following modules are supported: " - "`torch.nn.Linear`, `torch.nn.Embedding`, `torch.nn.Conv2d`, `transformers.pytorch_utils.Conv1D`." - ) - - return new_cell - - def __getattr__(self, name: str): - """Forward missing attributes to the wrapped cell.""" - try: - return super().__getattr__(name) # defer to nn.Module's logic - except AttributeError: - return getattr(self.model, name) - - def get_peft_config_as_dict(self, inference: bool = False): - r""" - Returns a dictionary representation of the PEFT config. - - Args: - self: An instance of the LoraModel class. - inference (bool): A flag indicating whether the method is called for inference. Default is False. - - Returns: - dict: A dictionary containing the PEFT config. The keys represent the configuration options, and the values - represent their corresponding values. If 'inference' is True, the dictionary will also include the - 'inference_mode' key set to True. - - Raises: - None. - - Note: - - The method uses the 'peft_config' attribute of the LoraModel instance to create the dictionary. - - If a value in the 'peft_config' attribute is an instance of Enum, its value will be extracted using the - 'value' attribute. - - The 'config_dict' dictionary will only contain one key-value pair. If the 'inference' flag is True, the - 'config_dict' will be updated to include the 'inference_mode' key. 
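The bias handling in `_mark_only_adapters_as_trainable` above boils down to three modes: `"none"` leaves only the `lora_`-prefixed parameters trainable, `"all"` re-enables every bias, and `"lora_only"` re-enables only biases that live on LoRA layers. A simplified sketch of that rule (adapter parameters are shown as always trainable, which is the usual outcome; names are illustrative):

```py
# Simplified sketch of the trainability rules applied in
# `_mark_only_adapters_as_trainable` above, for a single parameter name.
def trainable(name, bias_mode, prefix="lora_", is_lora_layer_bias=False):
    if prefix in name:
        return True                       # adapter parameters are left trainable
    if bias_mode == "all":
        return "bias" in name             # every bias becomes trainable
    if bias_mode == "lora_only":
        return "bias" in name and is_lora_layer_bias
    return False                          # bias_mode == "none": everything else frozen

assert trainable("layer.lora_A.weight", "none")
assert not trainable("layer.weight", "all")
assert trainable("layer.bias", "all")
assert not trainable("layer.bias", "lora_only")          # bias not on a LoRA layer
assert trainable("layer.bias", "lora_only", is_lora_layer_bias=True)
```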
- - Example usage: - model = LoraModel() - config = model.get_peft_config_as_dict(inference=True) - print(config) # {'inference_mode': True} - - config = model.get_peft_config_as_dict() - print(config) # {} - - """ - config_dict = {} - for key, value in self.peft_config.items(): - config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()} - if inference: - config["inference_mode"] = True - config_dict[key] = config # pylint: disable=undefined-loop-variable - return config - - def _set_adapter_layers(self, enabled: bool = True) -> None: - r""" - Sets the adapter layers for the LoraModel. - - Args: - self (LoraModel): The instance of the LoraModel class. - enabled (bool, optional): A flag to enable or disable the adapter layers. Defaults to True. - - Returns: - None. This method does not return any value. - - Raises: - None. - """ - for cell in self.model.modules(): - if isinstance(cell, (BaseTunerLayer, ModulesToSaveWrapper)): - cell.enable_adapters(enabled) - - def enable_adapter_layers(self) -> None: - """Enable all adapters. - - Call this if you have previously disabled all adapters and want to re-enable them. - """ - self._set_adapter_layers(enabled=True) - - def disable_adapter_layers(self) -> None: - """Disable all adapters. - - When disabling all adapters, the model output corresponds to the output of the base model. - """ - for active_adapter in self.active_adapters: - val = self.peft_config[active_adapter].bias - if val != "none": - msg = ( - f"Careful, disabling adapter layers with bias configured to be '{val}' does not produce the same " - "output as the the base model would without adaption." - ) - warnings.warn(msg) - self._set_adapter_layers(enabled=False) - - def set_adapter(self, adapter_name: str | list[str]) -> None: - """Set the active adapter(s). - - Additionally, this function will set the specified adapters to trainable (i.e., requires_grad=True). If this is - not desired, use the following code. - - ```py - >>> for name, param in model_peft.parameters_and_names(): - ... if ...: # some check on name (ex. if 'lora' in name) - ... param.requires_grad = False - ``` - - Args: - adapter_name (`str` or `list[str]`): Name of the adapter(s) to be activated. - """ - for cell in self.model.modules(): - if isinstance(cell, LoraLayer): - if cell.merged: - warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.") - cell.unmerge() - cell.set_adapter(adapter_name) - self.active_adapter = adapter_name - - @contextmanager - def _enable_peft_forward_hooks(self, *args, **kwargs): - r""" - Enable PEFT forward hooks for the LoraModel class. - - Args: - self (LoraModel): The instance of the LoraModel class. - - Returns: - None. This method is intended to be used as a context manager and does not explicitly return a value. - - Raises: - ValueError: If the 'adapter_names' parameter is provided while the model is in training mode. - """ - # If adapter_names is passed as an argument, we inject it into the forward arguments. 
- adapter_names = kwargs.pop("adapter_names", None) - if adapter_names is None: - # nothing to do - yield - return - - if self.training: - raise ValueError("Cannot pass `adapter_names` when the model is in training mode.") - - hook_handles = [] - for cell in self.modules(): - if isinstance(cell, LoraLayer): - pre_forward = partial(_adapter_names_pre_forward_hook, adapter_names=adapter_names) - handle = cell.register_forward_pre_hook(pre_forward, with_kwargs=True) - hook_handles.append(handle) - - yield - - for handle in hook_handles: - handle.remove() - - def _check_merge_allowed(self): - """Verify that the configuration supports merging. - - Currently gptq quantization and replicated layers do not support merging. - """ - if getattr(self.model, "quantization_method", None) == "gptq": - raise ValueError("Cannot merge LORA layers when the model is gptq quantized") - if self.peft_config.get("layer_replication"): - raise ValueError("Cannot merge LORA layers when base model layers are replicated") - - @staticmethod - def _prepare_adapter_config(peft_config, model_config): - r""" - Prepare the adapter configuration for a LoraModel. - - This method takes two parameters, peft_config and model_config, and returns None. - - Args: - peft_config (PeftConfig): The configuration for the adapter. - - target_modules (set): The target modules for the adapter. If not specified, it will be determined based on the model type. - model_config (dict): The configuration for the model. - - model_type (str): The type of the model. - - Returns: - None. The method does not return any value. - - Raises: - ValueError: If the target_modules is not specified in peft_config and the model_type is not found in the TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING. - - """ - if peft_config.target_modules is None: - if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING: - raise ValueError("Please specify `target_modules` in `peft_config`") - peft_config.target_modules = set( - TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]] - ) - return peft_config - - def _unload_and_optionally_merge( - self, - merge=True, - progressbar: bool = False, - safe_merge: bool = False, - adapter_names: Optional[list[str]] = None, - ): - r""" - Method to unload and optionally merge a LoraModel. - - Args: - - self: The instance of the LoraModel class. - - merge (bool): Flag indicating whether to perform a merge operation. - - progressbar (bool): Flag indicating whether to display a progress bar during unloading. - - safe_merge (bool): Flag indicating whether to perform a safe merge operation. - - adapter_names (Optional[list[str]]): List of names of adapters to consider during unloading. - - Returns: - None. The method modifies the model in place. - - Raises: - - AttributeError: If an attribute error occurs during the unloading process. 
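`_enable_peft_forward_hooks` above temporarily registers a pre-forward hook on every LoRA layer so that `adapter_names` is injected into the layer's kwargs for one forward call, then removes the hooks. A framework-agnostic sketch of that pattern with stand-in classes (none of these names come from the library):

```py
# Framework-agnostic sketch of the pre-forward-hook trick above: a hook that
# injects `adapter_names` into a layer's kwargs, only inside a context manager.
from contextlib import contextmanager

class TinyLayer:
    """Stand-in for a LoraLayer that accepts an optional adapter_names kwarg."""
    def __init__(self):
        self._pre_hooks = []
    def register_forward_pre_hook(self, fn):
        self._pre_hooks.append(fn)
        return lambda: self._pre_hooks.remove(fn)   # plays the role of handle.remove()
    def __call__(self, *args, **kwargs):
        for hook in self._pre_hooks:
            args, kwargs = hook(self, args, kwargs)
        return kwargs.get("adapter_names")

def adapter_names_pre_hook(target, args, kwargs, adapter_names):
    kwargs["adapter_names"] = adapter_names          # same idea as the hook above
    return args, kwargs

@contextmanager
def enable_adapter_names(layers, adapter_names):
    removers = []
    for layer in layers:
        hook = lambda t, a, k, names=adapter_names: adapter_names_pre_hook(t, a, k, names)
        removers.append(layer.register_forward_pre_hook(hook))
    try:
        yield
    finally:
        for remove in removers:
            remove()                                 # hooks only live inside the context

layer = TinyLayer()
with enable_adapter_names([layer], ["adapter_a", "__base__"]):
    assert layer() == ["adapter_a", "__base__"]
assert layer() is None
```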
- """ - if merge: - self._check_merge_allowed() - - key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] - desc = "Unloading " + ("and merging " if merge else "") + "model" - for key in tqdm(key_list, disable=not progressbar, desc=desc): - try: - parent, target, target_name = _get_submodules(self.model, key) - except AttributeError: - continue - # with onload_layer(target): - # if hasattr(target, "base_layer"): - # if merge: - # target.merge(safe_merge=safe_merge, adapter_names=adapter_names) - # self._replace_cell(parent, target_name, target.get_base_layer(), target) - # elif isinstance(target, ModulesToSaveWrapper): - # # save any additional trainable modules part of `modules_to_save` - # new_cell = target.modules_to_save[target.active_adapter] - # if hasattr(new_cell, "base_layer"): - # # check if the cell is itself a tuner layer - # if merge: - # new_cell.merge(safe_merge=safe_merge, adapter_names=adapter_names) - # new_cell = new_cell.get_base_layer() - # setattr(parent, target_name, new_cell) - - return self.model - - def _check_add_weighted_adapter( - self, adapters: list[str], combination_type: str, svd_rank: int | None - ) -> tuple[str, int, str]: - """ - Helper function to check if the arguments to add_weighted_adapter are valid and compatible with the underlying - model. - """ - for adapter in adapters: - if adapter not in list(self.peft_config.keys()): - raise ValueError(f"Adapter {adapter} does not exist") - - # If more than one of the adapters targets the same cell with modules_to_save, raise an error, as these - # modules cannot be merged. First, find the ModulesToSaveWrapper instances in the model, then check if they - # have modules for the adapters to be merged. - modules_to_save_wrappers = [cell for cell in self.modules() if isinstance(cell, ModulesToSaveWrapper)] - problematic_wrappers = [ - wrapper - for wrapper in modules_to_save_wrappers - if sum(adapter in wrapper.modules_to_save for adapter in adapters) > 1 - ] - if problematic_wrappers: - raise ValueError( - "Cannot add weighted adapters if they target the same cell with modules_to_save, but found " - f"{len(problematic_wrappers)} such instance(s)." - ) - - # if there is only one adapter, we can only use linear merging - combination_type = "linear" if len(adapters) == 1 else combination_type - - adapters_ranks = [self.peft_config[adapter].r for adapter in adapters] - if combination_type in ("linear", "ties", "dare_ties", "dare_linear", "magnitude_prune"): - # all adapters ranks should be same, new rank is just this value - if len(set(adapters_ranks)) != 1: - raise ValueError( - "All adapters must have the same r value when using combination_type linear, ties, dare_ties or " - "dare_linear." 
- ) - new_rank = adapters_ranks[0] - elif combination_type == "cat": - # adapters ranks may be different, new rank is sum of all ranks - # be careful, because output adapter rank may be really big if mixing a lot of adapters - new_rank = sum(adapters_ranks) - elif combination_type.endswith("svd"): - # new rank is the max of all ranks of the adapters if not provided - new_rank = svd_rank or max(adapters_ranks) - else: - raise ValueError(f"Invalid combination_type: {combination_type}") - - target_module_types = [type(self.peft_config[adapter].target_modules) for adapter in adapters] - if not target_module_types: - raise ValueError(f"Found no adapter matching the names in {adapters}") - if len(set(target_module_types)) > 1: - raise ValueError( - "all adapter configs should follow the same target modules type. " - "Combining adapters with `target_modules` type being a mix of list/set and string is not supported." - ) - - if target_module_types[0] == str: - new_target_modules = "|".join(f"({self.peft_config[adapter].target_modules})" for adapter in adapters) - elif target_module_types[0] == set: - new_target_modules = reduce( - operator.or_, (self.peft_config[adapter].target_modules for adapter in adapters) - ) - else: - raise TypeError(f"Invalid type {target_module_types[0]} found in target_modules") - - return combination_type, new_rank, new_target_modules - - def add_weighted_adapter( - self, - adapters: list[str], - weights: list[float], - adapter_name: str, - combination_type: str = "svd", - svd_rank: int | None = None, - svd_clamp: int | None = None, - svd_full_matrices: bool = True, - density: float | None = None, - majority_sign_method: Literal["total", "frequency"] = "total", - ) -> None: - """ - This method adds a new adapter by merging the given adapters with the given weights. - - When using the `cat` combination_type you should be aware that rank of the resulting adapter will be equal to - the sum of all adapters ranks. So it's possible that the mixed adapter may become too big and result in OOM - errors. - - Args: - adapters (`list`): - List of adapter names to be merged. - weights (`list`): - List of weights for each adapter. - adapter_name (`str`): - Name of the new adapter. - combination_type (`str`): - The merging type can be one of [`svd`, `linear`, `cat`, `ties`, `ties_svd`, `dare_ties`, `dare_linear`, - `dare_ties_svd`, `dare_linear_svd`, `magnitude_prune`, `magnitude_prune_svd`]. When using the `cat` - combination_type, the rank of the resulting adapter is equal to the sum of all adapters ranks (the - mixed adapter may be too big and result in OOM errors). - svd_rank (`int`, *optional*): - Rank of output adapter for svd. If None provided, will use max rank of merging adapters. - svd_clamp (`float`, *optional*): - A quantile threshold for clamping SVD decomposition output. If None is provided, do not perform - clamping. Defaults to None. - svd_full_matrices (`bool`, *optional*): - Controls whether to compute the full or reduced SVD, and consequently, the shape of the returned - tensors U and Vh. Defaults to True. - density (`float`, *optional*): - Value between 0 and 1. 0 means all values are pruned and 1 means no values are pruned. Should be used - with [`ties`, `ties_svd`, `dare_ties`, `dare_linear`, `dare_ties_svd`, `dare_linear_svd`, - `magnintude_prune`, `magnitude_prune_svd`] - majority_sign_method (`str`): - The method, should be one of ["total", "frequency"], to use to get the magnitude of the sign values. 
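The rank bookkeeping in `_check_add_weighted_adapter` above reduces to a short rule; here is a hedged, stand-alone restatement (not the library method itself): same-rank adapters for the linear-style merges, summed ranks for `cat`, and `svd_rank` or the maximum rank for the SVD variants.

```py
# Stand-alone restatement of the new-rank rule described above (illustration only).
def resolve_new_rank(combination_type, adapter_ranks, svd_rank=None):
    if combination_type in ("linear", "ties", "dare_ties", "dare_linear", "magnitude_prune"):
        if len(set(adapter_ranks)) != 1:
            raise ValueError("All adapters must share the same r for this combination_type.")
        return adapter_ranks[0]
    if combination_type == "cat":
        return sum(adapter_ranks)            # ranks are concatenated; watch memory use
    if combination_type.endswith("svd"):
        return svd_rank or max(adapter_ranks)
    raise ValueError(f"Invalid combination_type: {combination_type}")

assert resolve_new_rank("cat", [8, 16]) == 24
assert resolve_new_rank("svd", [8, 16]) == 16
```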
- Should be used with [`ties`, `ties_svd`, `dare_ties`, `dare_ties_svd`] - """ - if adapter_name in list(self.peft_config.keys()): - return - for adapter in adapters: - if adapter not in list(self.peft_config.keys()): - raise ValueError(f"Adapter {adapter} does not exist") - - combination_type, new_rank, new_target_modules = self._check_add_weighted_adapter( - adapters=adapters, - combination_type=combination_type, - svd_rank=svd_rank, - ) - - self.peft_config[adapter_name] = replace( - self.peft_config[adapters[0]], - r=new_rank, - lora_alpha=new_rank, - target_modules=new_target_modules, - ) - self.inject_adapter(self.model, adapter_name) - - # Do we really need that? - _freeze_adapter(self.model, adapter_name) - - key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] - for key in key_list: - _, target, _ = _get_submodules(self.model, key) - if isinstance(target, LoraLayer): - if adapter_name in target.lora_A: - target_lora_A = target.lora_A[adapter_name].weight - target_lora_B = target.lora_B[adapter_name].weight - elif adapter_name in target.lora_embedding_A: - target_lora_A = target.lora_embedding_A[adapter_name] - target_lora_B = target.lora_embedding_B[adapter_name] - else: - continue - - target_lora_A.data = target_lora_A.data * 0.0 - target_lora_B.data = target_lora_B.data * 0.0 - if combination_type == "cat": - loras_A, loras_B = [], [] - for adapter, weight in zip(adapters, weights): - if adapter in target.lora_A: - current_adapter_lora_A = target.lora_A[adapter].weight - current_adapter_lora_B = target.lora_B[adapter].weight - elif adapter in target.lora_embedding_A: - current_adapter_lora_A = target.lora_embedding_A[adapter] - current_adapter_lora_B = target.lora_embedding_B[adapter] - else: - continue - loras_A.append(current_adapter_lora_A.data * weight * target.scaling[adapter]) - loras_B.append(current_adapter_lora_B.data) - - if len(loras_A) == 0: - raise ValueError("No matching LoRAs found. Please raise an issue on GitHub.") - loras_A = ops.cat(loras_A, dim=0) - loras_B = ops.cat(loras_B, dim=1) - target_lora_A.data[: loras_A.shape[0], :] = loras_A - target_lora_B.data[:, : loras_B.shape[1]] = loras_B - elif combination_type in [ - "svd", - "ties_svd", - "dare_linear_svd", - "dare_ties_svd", - "magnitude_prune_svd", - ]: - target_lora_A.data, target_lora_B.data = self._svd_generalized_task_arithmetic_weighted_adapter( - combination_type, - adapters, - weights, - new_rank, - target, - target_lora_A, - target_lora_B, - density, - majority_sign_method, - svd_clamp, - full_matrices=svd_full_matrices, - ) - elif combination_type in ["linear", "ties", "dare_linear", "dare_ties", "magnitude_prune"]: - target_lora_A.data, target_lora_B.data = self._generalized_task_arithmetic_weighted_adapter( - combination_type, adapters, weights, target, density, majority_sign_method - ) - - def _svd_generalized_task_arithmetic_weighted_adapter( - self, - combination_type, - adapters, - weights, - new_rank, - target, - target_lora_A, - target_lora_B, - density, - majority_sign_method, - clamp=None, - full_matrices=True, - ): - r"""Perform a Singular Value Decomposition (SVD) with various combination types on the given parameters. - - Args: - self (LoraModel): The instance of the LoraModel class. - combination_type (str): The type of combination to perform. Valid options are: - - 'svd': Standard SVD combination. - - 'ties_svd': Combination with ties. - - 'dare_linear_svd': Combination with DARE (Density-Aware Ranking Evaluation) using linear interpolation. 
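For the `cat` branch above, a NumPy sketch of the shape arithmetic (shapes follow the usual LoRA convention, `lora_A`: `(r, in_features)`, `lora_B`: `(out_features, r)`; as in the code, only the A factor carries the weight and scaling):

```py
import numpy as np

def cat_adapters(loras_A, loras_B, weights, scalings):
    A = np.concatenate([w * s * a for a, w, s in zip(loras_A, weights, scalings)], axis=0)
    B = np.concatenate(loras_B, axis=1)
    return A, B                               # merged rank = sum of individual ranks

A1, B1 = np.ones((4, 16)), np.ones((32, 4))   # rank-4 adapter
A2, B2 = np.ones((8, 16)), np.ones((32, 8))   # rank-8 adapter
A, B = cat_adapters([A1, A2], [B1, B2], weights=[1.0, 0.5], scalings=[2.0, 2.0])
print(A.shape, B.shape)                       # (12, 16) (32, 12)
```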
- - 'dare_ties_svd': Combination with DARE (Density-Aware Ranking Evaluation) using ties. - - 'magnitude_prune_svd': Combination with magnitude pruning. - adapters (list): A list of adapters to consider for the combination. - weights (list): A list of weights corresponding to the adapters. - new_rank (int): The desired new rank after the combination. - target: The target object. - target_lora_A: The target LoRA A object. - target_lora_B: The target LoRA B object. - density (float): The density parameter used in combination types 'ties_svd', 'dare_linear_svd', 'dare_ties_svd', and 'magnitude_prune_svd'. - majority_sign_method (str): The majority sign method used in combination types 'ties_svd' and 'dare_ties_svd'. Valid options are: - - 'positive': Majority sign is positive. - - 'negative': Majority sign is negative. - - 'absolute': Majority sign is absolute. - clamp (float, optional): The clamping value. Defaults to None. - full_matrices (bool, optional): Whether to compute full matrices in the SVD computation. Defaults to True. - - Returns: - None - - Raises: - ValueError: If no matching LoRAs are found. - ValueError: If an invalid value is passed to the combination_type parameter. - - """ - valid_adapters = [] - valid_weights = [] - is_embedding = any(adapter in target.lora_embedding_A for adapter in adapters) - for adapter, weight in zip(adapters, weights): - if adapter in target.lora_A or adapter in target.lora_embedding_A: - valid_adapters.append(adapter) - valid_weights.append(weight * target.scaling[adapter]) - - # if no valid adapter, nothing to do - if len(valid_adapters) == 0: - raise ValueError("No matching LoRAs found. Please raise an issue on Github.") - delta_weight = [target.get_delta_weight(adapter) for adapter in valid_adapters] - valid_weights = mindspore.tensor(valid_weights) - if combination_type == "svd": - delta_weight = task_arithmetic(delta_weight, valid_weights) - elif combination_type == "ties_svd": - delta_weight = ties(delta_weight, valid_weights, density, majority_sign_method) - elif combination_type == "dare_linear_svd": - delta_weight = dare_linear(delta_weight, valid_weights, density) - elif combination_type == "dare_ties_svd": - delta_weight = dare_ties(delta_weight, valid_weights, density, majority_sign_method) - elif combination_type == "magnitude_prune_svd": - delta_weight = magnitude_prune(delta_weight, valid_weights, density) - else: - raise ValueError(f"Invalid value passed to combination type: {combination_type}") - - conv2d = isinstance(target, Conv2d) - if conv2d: - conv2d_1x1 = target.weight.shape[2:4] == (1, 1) - if not conv2d_1x1: - delta_weight = delta_weight.flatten(start_dim=1) - else: - delta_weight = delta_weight.squeeze() - if (hasattr(target, "fan_in_fan_out") and target.fan_in_fan_out) or is_embedding: - delta_weight = delta_weight.T - - # based on https://github.com/kohya-ss/sd-scripts/blob/main/networks/svd_merge_lora.py#L114-L131 - U, S, Vh = ops.svd(delta_weight, full_matrices=full_matrices) - U = U[:, :new_rank] - S = S[:new_rank] - U = U @ ops.diag(S) - Vh = Vh[:new_rank, :] - if clamp is not None: - dist = ops.cat([U.flatten(), Vh.flatten()]) - hi_val = ops.quantile(dist, clamp) - low_val = -hi_val - U = U.clamp(low_val, hi_val) - Vh = Vh.clamp(low_val, hi_val) - if conv2d: - U = U.reshape(target_lora_B.data.shape) - Vh = Vh.reshape(target_lora_A.data.shape) - return Vh, U - - def _generalized_task_arithmetic_weighted_adapter( - self, - combination_type, - adapters, - weights, - target, - density, - majority_sign_method, - ): - 
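The SVD branch above merges the full delta weights first and then factors the result back into a rank-`new_rank` pair, so `lora_A ≈ Vh` and `lora_B ≈ U·diag(S)`. A rough NumPy equivalent, with the clamping and the conv2d/embedding reshapes omitted:

```py
import numpy as np

def svd_to_lora(delta_weight, new_rank):
    U, S, Vh = np.linalg.svd(delta_weight, full_matrices=False)
    U = U[:, :new_rank] * S[:new_rank]        # fold singular values into U
    Vh = Vh[:new_rank, :]
    return Vh, U                              # lora_A ~ Vh, lora_B ~ U

delta = np.random.randn(32, 16)
lora_A, lora_B = svd_to_lora(delta, new_rank=4)
print(lora_A.shape, lora_B.shape)             # (4, 16) (32, 4)
print(np.allclose(lora_B @ lora_A, delta))    # False in general: rank-4 approximation
```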
r""" - Generalized Task Arithmetic Weighted Adapter. - - This method performs a weighted combination of task arithmetic operations on the given adapters and their corresponding weights. - The combination type determines the specific arithmetic operation to be applied. - - Args: - self (LoraModel): The instance of the LoraModel class. - combination_type (str): The type of combination to be performed. Valid values are: - - 'linear': Perform a linear combination of the task tensors. - - 'ties': Perform a combination of task tensors with tie handling. - - 'dare_linear': Perform a linear combination of task tensors with density-aware regularization. - - 'dare_ties': Perform a combination of task tensors with tie handling and density-aware regularization. - - 'magnitude_prune': Perform a combination of task tensors with magnitude pruning. - adapters (list): A list of adapter names. - weights (list): A list of weights corresponding to the adapters. - target (Target): The target object containing the lora_A, lora_B, lora_embedding_A, and lora_embedding_B attributes. - density (float): The density parameter for density-aware regularization. - majority_sign_method (str): The method to determine the sign of the majority in tie handling. Valid values are: - - 'positive': The majority is considered positive. - - 'negative': The majority is considered negative. - - Returns: - list: A list containing the combined task tensors for lora_A and lora_B. - - Raises: - ValueError: If the combination_type parameter is not one of the valid combination types. - """ - # account weights for LoRA A and B layers. - valid_weights = [] - lora_A_deltas = [] - lora_B_deltas = [] - for adapter, weight in zip(adapters, weights): - if adapter in target.lora_A: - current_adapter_lora_A = target.lora_A[adapter].weight - current_adapter_lora_B = target.lora_B[adapter].weight - elif adapter in target.lora_embedding_A: - current_adapter_lora_A = target.lora_embedding_A[adapter] - current_adapter_lora_B = target.lora_embedding_B[adapter] - else: - continue - valid_weights.append(math.sqrt(weight * target.scaling[adapter])) - lora_A_deltas.append(current_adapter_lora_A.data) - lora_B_deltas.append(current_adapter_lora_B.data) - valid_weights = mindspore.tensor(valid_weights) - lora_deltas = [lora_A_deltas, lora_B_deltas] - dtype = lora_A_deltas[0].dtype - for i, task_tensors in enumerate(lora_deltas): - if combination_type == "linear": - lora_deltas[i] = task_arithmetic(task_tensors, valid_weights) - elif combination_type == "ties": - lora_deltas[i] = ties(task_tensors, valid_weights, density, majority_sign_method) - elif combination_type == "dare_linear": - lora_deltas[i] = dare_linear(task_tensors, valid_weights, density) - elif combination_type == "dare_ties": - lora_deltas[i] = dare_ties(task_tensors, valid_weights, density, majority_sign_method) - elif combination_type == "magnitude_prune": - lora_deltas[i] = magnitude_prune(task_tensors, valid_weights, density) - else: - raise ValueError("Invalid combination type") - lora_deltas = [delta.to(dtype) for delta in lora_deltas] - return lora_deltas - - def delete_adapter(self, adapter_name: str) -> None: - """ - Deletes an existing adapter. - - Args: - adapter_name (str): Name of the adapter to be deleted. 
- """ - if adapter_name not in list(self.peft_config.keys()): - raise ValueError(f"Adapter {adapter_name} does not exist") - del self.peft_config[adapter_name] - - key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] - new_adapter = None - for key in key_list: - _, target, _ = _get_submodules(self.model, key) - if isinstance(target, LoraLayer): - target.delete_adapter(adapter_name) - if new_adapter is None: - new_adapter = target.active_adapters[:] - - self.active_adapter = new_adapter or [] - - def merge_and_unload( - self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None - ) -> nn.Module: - r""" - This method merges the LoRa layers into the base model. This is needed if someone wants to use the base model - as a standalone model. - - Args: - progressbar (`bool`): - whether to show a progressbar indicating the unload and merge process - safe_merge (`bool`): - whether to activate the safe merging check to check if there is any potential Nan in the adapter - weights - adapter_names (`List[str]`, *optional*): - The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults - to `None`. - Example: - - ```py - >>> from transformers import AutoModelForCausalLM - >>> from peft import PeftModel - - >>> base_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b") - >>> peft_model_id = "smangrul/falcon-40B-int4-peft-lora-sfttrainer-sample" - >>> model = PeftModel.from_pretrained(base_model, peft_model_id) - >>> merged_model = model.merge_and_unload() - ``` - """ - return self._unload_and_optionally_merge( - progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names - ) - - def unload(self) -> nn.Module: - """ - Gets back the base model by removing all the lora modules without merging. This gives back the original base - model. - """ - return self._unload_and_optionally_merge(merge=False) diff --git a/mindnlp/peft/tuners/lycoris_utils.py b/mindnlp/peft/tuners/lycoris_utils.py deleted file mode 100644 index 7b6d0502f..000000000 --- a/mindnlp/peft/tuners/lycoris_utils.py +++ /dev/null @@ -1,449 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -LycorisConfig and LycorisLayer class for LyCORIS like adapters. 
-""" -from __future__ import annotations -from abc import abstractmethod -from dataclasses import dataclass, field -from typing import Any, Optional, Union - -import warnings -import mindspore - -from tqdm import tqdm - -from mindnlp.core import nn, ops -from mindnlp.peft.config import PeftConfig -from mindnlp.peft.utils import ( - ModulesToSaveWrapper, - _get_submodules, -) - -from .tuners_utils import ( - BaseTuner, - BaseTunerLayer, - check_adapters_to_merge, - check_target_module_exists, -) - - -@dataclass -class LycorisConfig(PeftConfig): - r""" - A base config for LyCORIS like adapters - """ - rank_pattern: Optional[dict] = field( - default_factory=dict, - metadata={ - "help": ( - "The mapping from layer names or regexp expression to ranks which are different from the default rank specified by `r`. " - "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 8`}" - ) - }, - ) - alpha_pattern: Optional[dict] = field( - default_factory=dict, - metadata={ - "help": ( - "The mapping from layer names or regexp expression to alphas which are different from the default alpha specified by `alpha`. " - "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 32`}" - ) - }, - ) - - -class LycorisLayer(BaseTunerLayer): - r""" - A base layer for LyCORIS like adapters - """ - # adapter_layer_names needs to be defined on the child class - other_param_names = ("r", "alpha", "scaling", "rank_dropout", "module_dropout") - - def __init__(self, base_layer: nn.Module) -> None: - self.base_layer = base_layer - self.r = {} - self.alpha = {} - self.scaling = {} - self.rank_dropout = {} - self.module_dropout = {} - - # Tuner info - self._disable_adapters = False - self.merged_adapters = [] - - @property - @abstractmethod - def _available_adapters(self) -> set[str]: ... - - def _init_empty_weights(self, cls, *args, **kwargs) -> None: - # A helper method that allows to initialize the layer of the given class without spending time to initialize the - # model weights. The implementation is inspired by - # https://pytorch.org/docs/stable/generated/torch.nn.utils.skip_init.html but this function cannot be used - # directly. - # Instead of this approach, it would be possible to bypass the __init__ of the class but that runs the risk of - # omitting important logic inside that __init__. - kwargs = kwargs.copy() - cls.__init__(self, *args, device="meta", **kwargs) - - @abstractmethod - def create_adapter_parameters(self, adapter_name: str, r: int, **kwargs): ... - - # TODO: refactor LoRA to use the same approach - @abstractmethod - def _get_delta_activations( - self, adapter_name: str, x: mindspore.Tensor, *args: Any, **kwargs: Any - ) -> mindspore.Tensor: - """Activations added on top of the base layer output (i.e. after the base layer forward pass)""" - @abstractmethod - def get_delta_weight(self, adapter_name: str) -> mindspore.Tensor: ... - - def merge( - self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None - ) -> None: - """ - Merge the active adapter weights into the base weights - - Args: - safe_merge (`bool`, *optional*): - If `True`, the merge operation will be performed in a copy of the original weights and check for NaNs - before merging the weights. This is useful if you want to check if the merge operation will produce - NaNs. Defaults to `False`. - adapter_names (`List[str]`, *optional*): - The list of adapter names that should be merged. If `None`, all active adapters will be merged. - Defaults to `None`. 
- """ - adapter_names = check_adapters_to_merge(self, adapter_names) - if not adapter_names: - # no adapter to merge - return - - for active_adapter in adapter_names: - if active_adapter in self._available_adapters: - base_layer = self.get_base_layer() - if safe_merge: - orig_weights = base_layer.weight.data.clone() - orig_weights += self.get_delta_weight(active_adapter) - - if not ops.isfinite(orig_weights).all(): - raise ValueError( - f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" - ) - - base_layer.weight.data = orig_weights - else: - base_layer.weight.data += self.get_delta_weight(active_adapter) - self.merged_adapters.append(active_adapter) - - @abstractmethod - def reset_adapter_parameters(self, adapter_name: str): ... - - def set_scale(self, adapter, scale): - if adapter not in self._available_adapters: - # Ignore the case where the adapter is not in the layer - return - self.scaling[adapter] = scale * self.alpha[adapter] / self.r[adapter] - - def scale_layer(self, scale: float) -> None: - if scale == 1: - return - - for active_adapter in self.active_adapters: - if active_adapter not in self._available_adapters: - continue - - self.scaling[active_adapter] *= scale - - def unmerge(self) -> None: - """ - This method unmerges all merged adapter layers from the base weights. - """ - if not self.merged: - warnings.warn("Already unmerged. Nothing to do.") - return - while len(self.merged_adapters) > 0: - active_adapter = self.merged_adapters.pop() - if active_adapter in self._available_adapters: - self.get_base_layer().weight.data -= self.get_delta_weight( - active_adapter - ) - - def unscale_layer(self, scale=None) -> None: - for active_adapter in self.active_adapters: - if active_adapter not in self._available_adapters: - continue - - if scale is None: - self.scaling[active_adapter] = ( - self.alpha[active_adapter] / self.r[active_adapter] - ) - else: - self.scaling[active_adapter] /= scale - - @abstractmethod - def update_layer(self, adapter_name: str, r: int, alpha: float, **kwargs): ... - - -class LycorisTuner(BaseTuner): - r""" - A base tuner for LyCORIS like adapters - """ - prefix: str - layers_mapping: dict[type[nn.Module], type[LycorisLayer]] - - # def __init__(self, model, config, adapter_name): - # super().__init__(model, config, adapter_name) - - def __getattr__(self, name: str): - """Forward missing attributes to the wrapped module.""" - try: - return super().__getattr__(name) - except AttributeError: - return getattr(self.model, name) - - @staticmethod - def _check_target_module_exists(config, key): - return check_target_module_exists(config, key) - - @abstractmethod - def _create_and_replace( - self, - config: LycorisConfig, - adapter_name: str, - target: Union[LycorisLayer, nn.Module], - target_name, - parent, - current_key, - ): ... 
- - @classmethod - def _create_new_module( - cls, config: LycorisConfig, adapter_name: str, target: nn.Module, **kwargs - ) -> LycorisLayer: - # Find corresponding subtype of provided target module - new_module_cls = None - for subtype, target_cls in cls.layers_mapping.items(): - - if ( - hasattr(target, "base_layer") - and isinstance(target.get_base_layer(), subtype) - and isinstance(target, BaseTunerLayer) - ): - # nested tuner layers are allowed - new_module_cls = target_cls - break - elif isinstance(target, subtype): - new_module_cls = target_cls - break - - # We didn't find corresponding type, so adapter for this layer is not supported - if new_module_cls is None: - supported_modules = ", ".join( - layer.__name__ for layer in cls.layers_mapping.keys() - ) - raise ValueError( - f"Target module of type {type(target)} not supported, " - f"currently only adapters for {supported_modules} are supported" - ) - - if isinstance(target, BaseTunerLayer): - target_base_layer = target.get_base_layer() - else: - target_base_layer = target - if isinstance(target_base_layer, nn.Conv2d): - new_module = new_module_cls(target, adapter_name=adapter_name, **kwargs) - elif isinstance(target_base_layer, nn.Linear): - new_module = new_module_cls(target, adapter_name=adapter_name, **kwargs) - else: - supported_modules = ", ".join( - layer.__name__ for layer in cls.layers_mapping.keys() - ) - raise ValueError( - f"Target module of type {type(target)} not supported, " - f"currently only adapters for {supported_modules} are supported" - ) - - return new_module - - def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: - for n, p in model.parameters_and_names(): - if self.prefix not in n: - p.requires_grad = False - - @staticmethod - def _prepare_adapter_config(peft_config, model_config): - if peft_config.target_modules is None: - raise ValueError("Please specify `target_modules` in `peft_config`") - return peft_config - - def _replace_module(self, parent, child_name, new_module, child): - setattr(parent, child_name, new_module) - # It's not necessary to set requires_grad here, as that is handled by - # _mark_only_adapters_as_trainable - - if not hasattr(new_module, "base_layer"): - new_module.weight = child.weight - if hasattr(child, "bias"): - new_module.bias = child.bias - - if getattr(child, "state", None) is not None: - if hasattr(new_module, "base_layer"): - new_module.base_layer.state = child.state - else: - new_module.state = child.state - - def _set_adapter_layers(self, enabled=True): - for module in self.model.modules(): - if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)): - module.enable_adapters(enabled) - - def _unload_and_optionally_merge( - self, - merge: bool = True, - progressbar: bool = False, - safe_merge: bool = False, - adapter_names: Optional[list[str]] = None, - ): - if merge: - if getattr(self.model, "quantization_method", None) == "gptq": - raise ValueError( - "Cannot merge LOHA layers when the model is gptq quantized" - ) - - self._unloading_checks(adapter_names) - key_list = [ - key for key, _ in self.model.named_modules() if self.prefix not in key - ] - desc = "Unloading " + ("and merging " if merge else "") + "model" - for key in tqdm(key_list, disable=not progressbar, desc=desc): - try: - parent, target, target_name = _get_submodules(self.model, key) - except AttributeError: - continue - - if hasattr(target, "base_layer"): - if merge: - target.merge(safe_merge=safe_merge, adapter_names=adapter_names) - self._replace_module( - parent, target_name, 
target.get_base_layer(), target - ) - elif isinstance(target, ModulesToSaveWrapper): - # save any additional trainable modules part of `modules_to_save` - new_module = target.modules_to_save[target.active_adapter] - if hasattr(new_module, "base_layer"): - # check if the module is itself a tuner layer - if merge: - new_module.merge( - safe_merge=safe_merge, adapter_names=adapter_names - ) - new_module = new_module.get_base_layer() - setattr(parent, target_name, new_module) - - return self.model - - def enable_adapter_layers(self) -> None: - """Enable all adapters. - - Call this if you have previously disabled all adapters and want to re-enable them. - """ - self._set_adapter_layers(enabled=True) - - def disable_adapter_layers(self) -> None: - """Disable all adapters. - - When disabling all adapters, the model output corresponds to the output of the base model. - """ - self._set_adapter_layers(enabled=False) - - def merge_and_unload( - self, - progressbar: bool = False, - safe_merge: bool = False, - adapter_names: Optional[list[str]] = None, - ) -> nn.Module: - r""" - This method merges the adapter layers into the base model. This is needed if someone wants to use the base - model as a standalone model. - - Args: - progressbar (`bool`): - whether to show a progressbar indicating the unload and merge process - safe_merge (`bool`): - whether to activate the safe merging check to check if there is any potential Nan in the adapter - weights - adapter_names (`List[str]`, *optional*): - The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults - to `None`. - - """ - return self._unload_and_optionally_merge( - progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names - ) - - def unload(self) -> nn.Module: - """ - Gets back the base model by removing all the lora modules without merging. This gives back the original base - model. - """ - return self._unload_and_optionally_merge(merge=False) - - def set_adapter(self, adapter_name: str | list[str]) -> None: - """Set the active adapter(s). - - Additionally, this function will set the specified adapters to trainable (i.e., requires_grad=True). If this is - not desired, use the following code. - - ```py - >>> for name, param in model_peft.named_parameters(): - ... if ...: # some check on name (ex. if 'lora' in name) - ... param.requires_grad = False - ``` - - Args: - adapter_name (`str` or `list[str]`): Name of the adapter(s) to be activated. - """ - for module in self.model.modules(): - if isinstance(module, LycorisLayer): - if module.merged: - warnings.warn( - "Adapter cannot be set when the model is merged. Unmerging the model first." - ) - module.unmerge() - module.set_adapter(adapter_name) - self.active_adapter = adapter_name - - def delete_adapter(self, adapter_name: str) -> None: - """ - Deletes an existing adapter. - - Args: - adapter_name (`str`): Name of the adapter to be deleted. 
- """ - if adapter_name not in list(self.peft_config.keys()): - raise ValueError(f"Adapter {adapter_name} does not exist") - del self.peft_config[adapter_name] - key_list = [ - key - for key, _ in self.model.parameters_and_names() - if self.prefix not in key - ] - new_adapter = None - for key in key_list: - _, target, _ = _get_submodules(self.model, key) - if isinstance(target, LycorisLayer): - target.delete_adapter(adapter_name) - if new_adapter is None: - new_adapter = target.active_adapters[:] - - self.active_adapter = new_adapter or [] diff --git a/mindnlp/peft/tuners/multitask_prompt_tuning/__init__.py b/mindnlp/peft/tuners/multitask_prompt_tuning/__init__.py deleted file mode 100644 index 666c403c3..000000000 --- a/mindnlp/peft/tuners/multitask_prompt_tuning/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""multitask prompt tuning""" -from .config import MultitaskPromptTuningConfig, MultitaskPromptTuningInit -from .model import MultitaskPromptEmbedding - - -__all__ = ["MultitaskPromptTuningConfig", "MultitaskPromptTuningInit", "MultitaskPromptEmbedding"] diff --git a/mindnlp/peft/tuners/multitask_prompt_tuning/config.py b/mindnlp/peft/tuners/multitask_prompt_tuning/config.py deleted file mode 100644 index 3c7622d5e..000000000 --- a/mindnlp/peft/tuners/multitask_prompt_tuning/config.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""multitask prompt tuning configs""" -import enum -from dataclasses import dataclass, field -from typing import Optional, Union - -from ..prompt_tuning import PromptTuningConfig -from ...utils import PeftType - - -class MultitaskPromptTuningInit(str, enum.Enum): - - """ - This class represents a multitask prompt tuning initialization process. - - It inherits from the built-in str class and the enum.Enum class, allowing instances of this class to have string-like behavior and access to enumeration functionality. - - MultitaskPromptTuningInit provides methods and attributes that are specific to the initialization process for multitask prompt tuning in a Python application. 
- - Attributes: - - - Methods: - - - """ - # initialize prompt with text - TEXT = "TEXT" - # initialize prompt with random matrix - RANDOM = "RANDOM" - # average the prefix and column matrices obtained during source training - AVERAGE_SOURCE_TASKS = "AVERAGE_SOURCE_TASKS" - # pick prefix and column matrices for a particular task obtained during source training - EXACT_SOURCE_TASK = "EXACT_SOURCE_TASK" - # only use the prompt embeddings trained during source training - ONLY_SOURCE_SHARED = "ONLY_SOURCE_SHARED" - - -@dataclass -class MultitaskPromptTuningConfig(PromptTuningConfig): - - """ - Represents a configuration class for multitask prompt tuning in a natural language processing model. - - This class inherits from PromptTuningConfig and provides additional configurations specifically for multitask prompt tuning. The class includes methods for initializing the configuration settings, such as -setting the prompt type to MULTITASK_PROMPT_TUNING. - """ - prompt_tuning_init: Union[MultitaskPromptTuningInit, str] = field( - default=MultitaskPromptTuningInit.RANDOM, - metadata={ - "help": ( - "How to initialize the prompt tuning parameters. Can be one of TEXT, RANDOM, AVERAGE_SOURCE_TASKS, " - "EXACT_SOURCE_TASK, ONLY_SOURCE_SHARED." - ), - }, - ) - prompt_tuning_init_state_dict_path: Optional[str] = field( - default=None, - metadata={ - "help": ( - "The path of source state dict. This is required when training the downstream target prompt from " - "the pretrained source prompt" - ), - }, - ) - prompt_tuning_init_task: Optional[int] = field(default=0, metadata={"help": "source task id for initialization"}) - num_ranks: Optional[int] = field(default=1, metadata={"help": "ranks"}) - num_tasks: Optional[int] = field(default=1, metadata={"help": "number of tasks"}) - - def __post_init__(self): - """ - This method is called immediately after the instance of the MultitaskPromptTuningConfig class is created. - - Args: - self: A reference to the instance of the class. It is automatically passed when the method is called. - - Returns: - None. This method does not return anything. - - Raises: - No specific exceptions are raised by this method. - """ - self.peft_type = PeftType.MULTITASK_PROMPT_TUNING diff --git a/mindnlp/peft/tuners/multitask_prompt_tuning/model.py b/mindnlp/peft/tuners/multitask_prompt_tuning/model.py deleted file mode 100644 index 0c0ba03f3..000000000 --- a/mindnlp/peft/tuners/multitask_prompt_tuning/model.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""multitask prompt tuning model""" -import mindspore -from mindnlp.core.nn import Parameter -from mindnlp.core import ops - -from ..prompt_tuning import PromptEmbedding -from ...utils import TaskType - -from .config import MultitaskPromptTuningConfig, MultitaskPromptTuningInit - - -# This code is adapted for the paper: https://arxiv.org/abs/2303.02861 and -# constitutes the work done at MIT-IBM Watson Research Lab. 
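The `AVERAGE_SOURCE_TASKS` and `EXACT_SOURCE_TASK` options listed above differ only in how the source-task factors are collapsed before they are loaded. A NumPy sketch with made-up shapes; `source_cols` / `source_rows` stand in for the `prefix_task_cols` / `prefix_task_rows` entries of the source state dict:

```py
import numpy as np

num_tasks, tokens, ranks, dim = 4, 20, 1, 768
source_cols = np.random.randn(num_tasks, tokens, ranks)    # stand-in for "prefix_task_cols"
source_rows = np.random.randn(num_tasks, ranks, dim)       # stand-in for "prefix_task_rows"

# AVERAGE_SOURCE_TASKS: average the factors over the task axis
avg_cols = source_cols.mean(0, keepdims=True)
avg_rows = source_rows.mean(0, keepdims=True)

# EXACT_SOURCE_TASK: keep the factors of one chosen source task
task_id = 2
exact_cols = source_cols[task_id][None, ...]
exact_rows = source_rows[task_id][None, ...]

print(avg_cols.shape, exact_cols.shape)                    # (1, 20, 1) (1, 20, 1)
```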
- - -class MultitaskPromptEmbedding(PromptEmbedding): - - """ - Represents a multitask prompt embedding for natural language processing tasks. - - This class inherits from PromptEmbedding and provides functionality for forwarding multitask prompt embeddings using task-specific prefix embeddings. - - The class includes methods for initializing the multitask prompt embedding and forwarding the prompt embeddings for specific tasks. - - """ - def __init__(self, config: MultitaskPromptTuningConfig, word_embeddings): - """ - Initializes an instance of the MultitaskPromptEmbedding class. - - Args: - self: The instance of the class. - config (MultitaskPromptTuningConfig): The configuration object containing various settings for the prompt embedding. - word_embeddings: The word embeddings used for the prompt embedding. - - Returns: - None - - Raises: - ValueError: If the `prompt_tuning_init_state_dict_path` is not specified when using certain initialization methods. - FileNotFoundError: If the specified `prompt_tuning_init_state_dict_path` file is not found. - KeyError: If the required keys are not present in the state_dict. - """ - super().__init__(config, word_embeddings) - - self.num_tasks = config.num_tasks - self.num_ranks = config.num_ranks - self.num_virtual_tokens = config.num_virtual_tokens - - self.num_transformer_submodules = config.num_transformer_submodules - if self.num_transformer_submodules is None: - self.num_transformer_submodules = 2 if config.task_type == TaskType.SEQ_2_SEQ_LM else 1 - - self.token_dim = config.token_dim - - total_virtual_tokens = self.num_virtual_tokens * self.num_transformer_submodules - - self.prefix_task_cols = Parameter( - ops.normal( - mean=0, - std=0.02, - size=(self.num_tasks, total_virtual_tokens, self.num_ranks), - ) - ) - self.prefix_task_rows = Parameter( - ops.normal( - mean=0, - std=0.02, - size=(self.num_tasks, self.num_ranks, self.token_dim), - ) - ) - - if config.prompt_tuning_init in [ - MultitaskPromptTuningInit.AVERAGE_SOURCE_TASKS, - MultitaskPromptTuningInit.EXACT_SOURCE_TASK, - MultitaskPromptTuningInit.ONLY_SOURCE_SHARED, - ]: - if config.prompt_tuning_init_state_dict_path is None: - raise ValueError( - f"prompt_tuning_init_state_dict_path needs to be specified with {config.prompt_tuning_init} " - "init method" - ) - - if config.prompt_tuning_init_state_dict_path.endswith(".safetensors"): - from mindnlp.core.serialization import safe_load_file - - state_dict: dict = safe_load_file(config.prompt_tuning_init_state_dict_path) - elif config.prompt_tuning_init_state_dict_path.endswith(".ckpt"): - state_dict = mindspore.load_checkpoint(config.prompt_tuning_init_state_dict_path) - else: - from mindnlp.core.serialization import load - state_dict: dict = load( - config.prompt_tuning_init_state_dict_path, - ) - - if config.prompt_tuning_init in [ - MultitaskPromptTuningInit.AVERAGE_SOURCE_TASKS, - MultitaskPromptTuningInit.EXACT_SOURCE_TASK, - ]: - prefix_task_cols_: mindspore.Tensor = state_dict["prefix_task_cols"] - prefix_task_rows_: mindspore.Tensor = state_dict["prefix_task_rows"] - - if config.prompt_tuning_init == MultitaskPromptTuningInit.AVERAGE_SOURCE_TASKS: - prefix_task_cols_ = prefix_task_cols_.mean(0, keep_dims=True) - prefix_task_rows_ = prefix_task_rows_.mean(0, keep_dims=True) - elif config.prompt_tuning_init == MultitaskPromptTuningInit.EXACT_SOURCE_TASK: - prefix_task_cols_ = prefix_task_cols_[config.prompt_tuning_init_task, ...].unsqueeze(0) - prefix_task_rows_ = prefix_task_rows_[config.prompt_tuning_init_task, 
...].unsqueeze(0) - - state_dict = { - "embedding.weight": state_dict["prompt_embeddings"], - "prefix_task_cols": prefix_task_cols_, - "prefix_task_rows": prefix_task_rows_, - } - - self.load_state_dict(state_dict, strict=True) - elif config.prompt_tuning_init == MultitaskPromptTuningInit.ONLY_SOURCE_SHARED: - state_dict = { - "embedding.weight": state_dict["prompt_embeddings"], - } - - self.load_state_dict(state_dict, strict=False) - - def forward(self, indices, task_ids): - """ - Construct prompt embeddings for multiple tasks. - - Args: - self (MultitaskPromptEmbedding): The instance of the MultitaskPromptEmbedding class. - indices (Tensor): A tensor containing indices for prompt embeddings. - task_ids (Tensor): A tensor containing task IDs for selecting specific tasks. - - Returns: - None. The method modifies the prompt_embeddings in-place. - - Raises: - ValueError: If task_ids is None. - """ - if task_ids is None: - raise ValueError("task_ids cannot be None") - - prompt_embeddings = self.embedding(indices) - - task_cols = ops.index_select(self.prefix_task_cols, 0, task_ids) - task_rows = ops.index_select(self.prefix_task_rows, 0, task_ids) - task_prompts = ops.matmul(task_cols, task_rows) - - prompt_embeddings *= task_prompts - - return prompt_embeddings diff --git a/mindnlp/peft/tuners/p_tuning/__init__.py b/mindnlp/peft/tuners/p_tuning/__init__.py deleted file mode 100644 index 2cc599cc0..000000000 --- a/mindnlp/peft/tuners/p_tuning/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""p-tuning""" -from .config import PromptEncoderConfig, PromptEncoderReparameterizationType -from .model import PromptEncoder - - -__all__ = ["PromptEncoder", "PromptEncoderConfig", "PromptEncoderReparameterizationType"] diff --git a/mindnlp/peft/tuners/p_tuning/config.py b/mindnlp/peft/tuners/p_tuning/config.py deleted file mode 100644 index e5ec0a812..000000000 --- a/mindnlp/peft/tuners/p_tuning/config.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""p-tuning config""" -import enum -from dataclasses import dataclass, field -from typing import Union - -from ...config import PromptLearningConfig -from ...utils import PeftType - - -class PromptEncoderReparameterizationType(str, enum.Enum): - - """ - Represents a reparameterization type for prompt encoders in Python. 
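The `forward` pass above modulates the shared prompt embedding elementwise with a per-task low-rank matrix (`task_cols @ task_rows`); a NumPy sketch of the shape flow, with assumed sizes:

```py
import numpy as np

batch, tokens, ranks, dim, num_tasks = 2, 20, 1, 768, 4
prompt = np.random.randn(batch, tokens, dim)               # shared prompt embeddings
task_cols = np.random.randn(num_tasks, tokens, ranks)
task_rows = np.random.randn(num_tasks, ranks, dim)

task_ids = np.array([0, 3])
per_task = task_cols[task_ids] @ task_rows[task_ids]       # (batch, tokens, dim)
out = prompt * per_task                                    # elementwise modulation
print(out.shape)                                           # (2, 20, 768)
```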
- - This class, 'PromptEncoderReparameterizationType', is a subclass of both 'str' and 'enum.Enum', and it provides a way to define the reparameterization type for prompt encoders in Python. - - Attributes: - - LINEAR: Represents a linear reparameterization type. - - LOG: Represents a logarithmic reparameterization type. - - EXPONENTIAL: Represents an exponential reparameterization type. - - Usage: - To use this class, create an instance of 'PromptEncoderReparameterizationType' and specify the desired reparameterization type. The available reparameterization types are defined as class attributes: - - - PromptEncoderReparameterizationType.LINEAR - - PromptEncoderReparameterizationType.LOG - - PromptEncoderReparameterizationType.EXPONENTIAL - - Example usage: - reparam_type = PromptEncoderReparameterizationType.LINEAR - print(reparam_type) # Output: 'LINEAR' - - reparam_type = PromptEncoderReparameterizationType.LOG - print(reparam_type) # Output: 'LOG' - - reparam_type = PromptEncoderReparameterizationType.EXPONENTIAL - print(reparam_type) # Output: 'EXPONENTIAL' - - Notes: - - This class inherits from 'str' and 'enum.Enum', providing all the functionalities of these base classes. - - The available reparameterization types are defined as class attributes and can be accessed using dot notation. - - The reparameterization type can be used to configure prompt encoders in various natural language processing tasks. - """ - MLP = "MLP" - LSTM = "LSTM" - - -@dataclass -class PromptEncoderConfig(PromptLearningConfig): - """ - This is the configuration class to store the configuration of a [`PromptEncoder`]. - - Args: - encoder_reparameterization_type (Union[[`PromptEncoderReparameterizationType`], `str`]): - The type of reparameterization to use. - encoder_hidden_size (`int`): The hidden size of the prompt encoder. - encoder_num_layers (`int`): The number of layers of the prompt encoder. - encoder_dropout (`float`): The dropout probability of the prompt encoder. - """ - encoder_reparameterization_type: Union[str, PromptEncoderReparameterizationType] = field( - default=PromptEncoderReparameterizationType.MLP, - metadata={"help": "How to reparameterize the prompt encoder"}, - ) - encoder_hidden_size: int = field( - default=None, - metadata={"help": "The hidden size of the prompt encoder"}, - ) - encoder_num_layers: int = field( - default=2, - metadata={"help": "The number of layers of the prompt encoder"}, - ) - encoder_dropout: float = field( - default=0.0, - metadata={"help": "The dropout of the prompt encoder"}, - ) - - def __post_init__(self): - """ - Method for initializing PromptEncoderConfig instances after creation. - - Args: - self: PromptEncoderConfig instance. - The instance of PromptEncoderConfig class to be initialized. - - Returns: - None. This method does not return any value. - - Raises: - No specific exceptions are raised by this method. - """ - self.peft_type = PeftType.P_TUNING diff --git a/mindnlp/peft/tuners/p_tuning/model.py b/mindnlp/peft/tuners/p_tuning/model.py deleted file mode 100644 index f506df59d..000000000 --- a/mindnlp/peft/tuners/p_tuning/model.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Based on https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/modules/common/prompt_encoder.py -# with some refactor -"""p-tuning model""" -import warnings - -from mindnlp.core import nn - -from .config import PromptEncoderConfig, PromptEncoderReparameterizationType - - -class PromptEncoder(nn.Module): - """ - The prompt encoder network that is used to generate the virtual token embeddings for p-tuning. - - Args: - config ([`PromptEncoderConfig`]): The configuration of the prompt encoder. - - Example: - - ```py - >>> from peft import PromptEncoder, PromptEncoderConfig - - >>> config = PromptEncoderConfig( - ... peft_type="P_TUNING", - ... task_type="SEQ_2_SEQ_LM", - ... num_virtual_tokens=20, - ... token_dim=768, - ... num_transformer_submodules=1, - ... num_attention_heads=12, - ... num_layers=12, - ... encoder_reparameterization_type="MLP", - ... encoder_hidden_size=768, - ... ) - - >>> prompt_encoder = PromptEncoder(config) - ``` - - **Attributes**: - - **embedding** (`nn.Embedding`) -- The embedding layer of the prompt encoder. - - **mlp_head** (`nn.Sequential`) -- The MLP head of the prompt encoder if `inference_mode=False`. - - **lstm_head** (`nn.LSTM`) -- The LSTM head of the prompt encoder if `inference_mode=False` and - `encoder_reparameterization_type="LSTM"`. - - **token_dim** (`int`) -- The hidden embedding dimension of the base transformer model. - - **input_size** (`int`) -- The input size of the prompt encoder. - - **output_size** (`int`) -- The output size of the prompt encoder. - - **hidden_size** (`int`) -- The hidden size of the prompt encoder. - - **total_virtual_tokens** (`int`): The total number of virtual tokens of the - prompt encoder. - - **encoder_type** (Union[[`PromptEncoderReparameterizationType`], `str`]): The encoder type of the prompt - encoder. - - - Input shape: (`batch_size`, `total_virtual_tokens`) - - Output shape: (`batch_size`, `total_virtual_tokens`, `token_dim`) - """ - def __init__(self, config): - """ - Initializes a PromptEncoder instance. - - Args: - self (PromptEncoder): The instance of the PromptEncoder class. - config (PromptEncoderConfig): An object containing configuration parameters for the PromptEncoder. - The configuration should include the following attributes: - - token_dim (int): The dimensionality of the token embeddings. - - encoder_hidden_size (int): The size of the hidden layer in the encoder. - - num_virtual_tokens (int): The number of virtual tokens. - - num_transformer_submodules (int): The number of transformer submodules. - - encoder_reparameterization_type (PromptEncoderReparameterizationType): The type of encoder reparameterization. - - encoder_dropout (float): The dropout rate for the encoder. - - encoder_num_layers (int): The number of layers in the encoder. - - inference_mode (bool): Flag indicating whether the model is in inference mode. - - Returns: - None. This method initializes the PromptEncoder instance with the provided configuration settings. - - Raises: - ValueError: If the encoder type specified in the configuration is not recognized. Accepted types are MLP or LSTM. 
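Shape-wise, the encoder documented above maps `(batch_size, total_virtual_tokens)` indices to `(batch_size, total_virtual_tokens, token_dim)` outputs. A NumPy stand-in for the MLP path (biases omitted; the LSTM variant instead feeds the embeddings through a bidirectional LSTM and a smaller two-layer MLP):

```py
import numpy as np

token_dim, hidden, total_virtual_tokens, batch = 768, 768, 20, 2
embedding = np.random.randn(total_virtual_tokens, token_dim)     # lookup table
W1 = np.random.randn(token_dim, hidden)
W2 = np.random.randn(hidden, hidden)
W3 = np.random.randn(hidden, token_dim)
relu = lambda x: np.maximum(x, 0)

indices = np.tile(np.arange(total_virtual_tokens), (batch, 1))   # (2, 20)
x = embedding[indices]                                           # (2, 20, 768)
out = relu(relu(x @ W1) @ W2) @ W3                               # MLP head
print(out.shape)                                                 # (2, 20, 768)
```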
- Warning: If the specified number of encoder layers is different from the default value when using the MLP encoder type. - """ - super().__init__() - self.token_dim = config.token_dim - self.input_size = self.token_dim - self.output_size = self.token_dim - self.hidden_size = config.encoder_hidden_size - self.total_virtual_tokens = config.num_virtual_tokens * config.num_transformer_submodules - self.encoder_type = config.encoder_reparameterization_type - - # embedding - self.embedding = nn.Embedding(self.total_virtual_tokens, self.token_dim) - if not config.inference_mode: - if self.encoder_type == PromptEncoderReparameterizationType.LSTM: - lstm_dropout = config.encoder_dropout - num_layers = config.encoder_num_layers - # LSTM - self.lstm_head = nn.LSTM( - input_size=self.input_size, - hidden_size=self.hidden_size, - num_layers=num_layers, - dropout=lstm_dropout, - bidirectional=True, - batch_first=True, - ) - - self.mlp_head = nn.Sequential( - nn.Linear(self.hidden_size * 2, self.hidden_size * 2), - nn.ReLU(), - nn.Linear(self.hidden_size * 2, self.output_size), - ) - - elif self.encoder_type == PromptEncoderReparameterizationType.MLP: - encoder_num_layers_default = PromptEncoderConfig.encoder_num_layers - if config.encoder_num_layers != encoder_num_layers_default: - warnings.warn( - f"for {self.encoder_type.value}, the argument `encoder_num_layers` is ignored. " - f"Exactly {encoder_num_layers_default} MLP layers are used." - ) - layers = [ - nn.Linear(self.input_size, self.hidden_size), - nn.ReLU(), - nn.Linear(self.hidden_size, self.hidden_size), - nn.ReLU(), - nn.Linear(self.hidden_size, self.output_size), - ] - self.mlp_head = nn.Sequential(*layers) - - else: - raise ValueError("Prompt encoder type not recognized. Please use one of MLP (recommended) or LSTM.") - - def forward(self, indices): - """ - Forward method in the PromptEncoder class. - - This method takes in two parameters, self and indices, and returns None. - - Args: - self: An instance of the PromptEncoder class. - indices (Tensor): A tensor containing the indices used for embedding lookup. The shape of the tensor should be (batch_size, sequence_length), where batch_size is the number of input sequences, and -sequence_length is the length of each input sequence. Each element in the tensor represents the index of a word in the vocabulary. - - Returns: - output_embeds (Tensor): A tensor containing the output embeddings. The shape of the tensor depends on the encoder type. If the encoder_type is PromptEncoderReparameterizationType.LSTM, the shape -will be (batch_size, sequence_length, embedding_size), where embedding_size is the size of the embedding vector. If the encoder_type is PromptEncoderReparameterizationType.MLP, the shape will be (batch_size, -sequence_length, output_size), where output_size is the size of the output vector. - - Raises: - ValueError: If the encoder_type is not recognized. Please use either PromptEncoderReparameterizationType.MLP or PromptEncoderReparameterizationType.LSTM. - - """ - input_embeds = self.embedding(indices) - if self.encoder_type == PromptEncoderReparameterizationType.LSTM: - output_embeds = self.mlp_head(self.lstm_head(input_embeds)[0]) - elif self.encoder_type == PromptEncoderReparameterizationType.MLP: - output_embeds = self.mlp_head(input_embeds) - else: - raise ValueError("Prompt encoder type not recognized. 
Please use one of MLP (recommended) or LSTM.") - - return output_embeds diff --git a/mindnlp/peft/tuners/poly/__init__.py b/mindnlp/peft/tuners/poly/__init__.py deleted file mode 100644 index ea37000ff..000000000 --- a/mindnlp/peft/tuners/poly/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""poly.""" - -from .config import PolyConfig -from .layer import Dense, PolyLayer -from .model import PolyModel - - -__all__ = ["Dense", "PolyConfig", "PolyLayer", "PolyModel"] diff --git a/mindnlp/peft/tuners/poly/config.py b/mindnlp/peft/tuners/poly/config.py deleted file mode 100644 index d4bb96a8d..000000000 --- a/mindnlp/peft/tuners/poly/config.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""poly config""" -from dataclasses import dataclass, field -from typing import List, Optional, Union -try: - from typing import Literal -except: - from typing_extensions import Literal -from mindnlp.peft.config import PeftConfig -from mindnlp.peft.utils import PeftType - - -@dataclass -class PolyConfig(PeftConfig): - """ - This is the configuration class to store the configuration of a [`PolyModel`]. - - [Polytropon (Poly)](https://arxiv.org/abs/2202.13914) - - [Multi-Head Routing (MHR)](https://arxiv.org/abs/2211.03831) - - Args: - r (`int`): Attention dimension of each Lora in Poly. - target_modules (`Union[List[str],str]`): The names of the modules to apply Poly to. - modules_to_save (`List[str]`): List of modules apart from Poly layers to be set as trainable - and saved in the final checkpoint. - init_weights (bool): Whether to perform initialization of Poly weights. - poly_type (`Literal["poly"]`): The variant of the Poly cell to use. Currently, only "poly" - is supported. - n_tasks (`int`): The number of tasks in a multitasking scenario. - n_skills (`int`): The number of skills (LoRA) in each Poly layer. - n_splits (`int`): The number of splits within each LoRA of a Poly layer. A value greater - than 1 indicates the use of Multi-Head Routing (MHR). - """ - - r: int = field(default=8, metadata={"help": "Lora attention dimension"}) - target_modules: Optional[Union[List[str], str]] = field( - default=None, - metadata={ - "help": "List of cell names or regex expression of the cell names to replace with Poly." 
- "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " - }, - ) - modules_to_save: Optional[List[str]] = field( - default=None, - metadata={ - "help": "List of modules apart from Poly layers to be set as trainable and saved in the final checkpoint. " - "For example, in Sequence Classification or Token Classification tasks, " - "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." - }, - ) - init_weights: bool = field( - default=True, - metadata={ - "help": ( - "Whether to initialize the weights of the Poly layers with their default initialization. Don't change " - "this setting, except if you know exactly what you're doing." - ), - }, - ) - poly_type: Literal["poly"] = field( - default="poly", - metadata={ - "help": 'Type of Poly modules to be used. Currently only "poly" is supported.' - }, - ) - n_tasks: int = field( - default=1, - metadata={"help": "Number of tasks in multitasking scenario."}, - ) - n_skills: int = field( - default=4, - metadata={"help": "Number of skills (LoRA) in each Poly layer."}, - ) - n_splits: int = field( - default=1, - metadata={"help": "Number of splits within each LoRA of a Poly layer."}, - ) - - def __post_init__(self): - self.peft_type = PeftType.POLY - self.target_modules = ( - set(self.target_modules) - if isinstance(self.target_modules, list) - else self.target_modules - ) diff --git a/mindnlp/peft/tuners/poly/layer.py b/mindnlp/peft/tuners/poly/layer.py deleted file mode 100644 index 2ba7e230a..000000000 --- a/mindnlp/peft/tuners/poly/layer.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""poly layer""" -import math -from typing import Any - -import mindspore - -from mindnlp.core import nn, ops -from mindnlp.core.nn import ParameterDict, Parameter -from mindnlp.peft.tuners.tuners_utils import BaseTunerLayer - -from .config import PolyConfig -from .router import get_router - - -class PolyLayer(BaseTunerLayer): - # All names of layers that may contain (trainable) adapter weights - adapter_layer_names = ("poly_lora_A", "poly_lora_B", "poly_router") - # All names of other parameters that may contain adapter-related parameters - other_param_names = ("r", "n_tasks", "n_skills", "n_splits") - - def __init__(self, base_layer: nn.Module, **kwargs): - self.base_layer = base_layer - self.r = {} - self.n_tasks = {} - self.n_skills = {} - self.n_splits = {} - self.poly_type = {} - self.poly_router = nn.ModuleDict() - self.poly_lora_A = ParameterDict() - self.poly_lora_B = ParameterDict() - self.kwargs = kwargs - - base_layer = self.get_base_layer() - if isinstance(base_layer, nn.Linear): - in_features, out_features = base_layer.in_features, base_layer.out_features - else: - raise ValueError(f"Unsupported layer type {type(base_layer)}") - - self.in_features = in_features - self.out_features = out_features - - def update_layer(self, adapter_name, poly_config): - if poly_config.r <= 0: - raise ValueError( - f"`r` should be a positive integer value but the value passed is {poly_config.r}" - ) - - self.r[adapter_name] = poly_config.r - self.n_tasks[adapter_name] = poly_config.n_tasks - self.n_skills[adapter_name] = poly_config.n_skills - self.n_splits[adapter_name] = poly_config.n_splits - self.poly_type[adapter_name] = poly_config.poly_type - - self.poly_lora_A[adapter_name] = Parameter( - ops.zeros( - poly_config.n_splits, - poly_config.n_skills, - self.in_features // poly_config.n_splits, - poly_config.r, - ) - ) - self.poly_lora_B[adapter_name] = Parameter( - ops.zeros( - poly_config.n_splits, - poly_config.n_skills, - poly_config.r, - self.out_features // poly_config.n_splits, - ) - ) - self.poly_router[adapter_name] = get_router(poly_config) - - self.reset_poly_parameters(adapter_name, init_weights=poly_config.init_weights) - - self.set_adapter(self.active_adapters) - - def reset_poly_parameters(self, adapter_name, init_weights): - if adapter_name in self.poly_lora_A.keys(): - # initialize A the same way as the default for nn.Linear - # https://github.com/microsoft/mttl/blob/ce4ca51dbca73be656feb9b3e5233633e3c5dec7/mttl/models/poly.py#L269 - n_splits, n_skills, d, r = self.poly_lora_A[adapter_name].shape - for skill in range(n_skills): - for split in range(n_splits): - param = ops.empty((r, d)) - nn.init.kaiming_uniform_(param, a=math.sqrt(5)) - self.poly_lora_A[adapter_name].data[split, skill, :, :] = param.T - - if init_weights: - # initialize B to zero - nn.init.zeros_(self.poly_lora_B[adapter_name]) - else: - # initialize B the same way as the default for nn.Linear - n_splits, n_skills, r, d = self.poly_lora_B[adapter_name].shape - for skill in range(n_skills): - for split in range(n_splits): - param = ops.empty((d, r)) - nn.init.kaiming_uniform_(param, a=math.sqrt(5)) - self.poly_lora_B[adapter_name].data[split, skill, :, :] = param.T - - # initialized router - self.poly_router[adapter_name].reset() - - -class Dense(nn.Module, PolyLayer): - # Lora implemented in a dense layer - def __init__( - self, - base_layer, - adapter_name: str, - poly_config: PolyConfig, - **kwargs, - ) -> None: - super().__init__() - PolyLayer.__init__(self, base_layer, **kwargs) - - self._active_adapter 
= adapter_name - self.update_layer(adapter_name, poly_config) - - def forward( - self, - x: mindspore.Tensor, - *args: Any, - task_ids: mindspore.Tensor = None, - **kwargs: Any, - ) -> mindspore.Tensor: - previous_dtype = x.dtype - if self.disable_adapters: - result = self.base_layer(x, *args, **kwargs) - else: - result = self.base_layer(x, *args, **kwargs) - for active_adapter in self.active_adapters: - if active_adapter not in self.poly_lora_A.keys(): - continue - - r = self.r[active_adapter] - poly_router = self.poly_router[active_adapter] - poly_lora_A = self.poly_lora_A[active_adapter] - poly_lora_B = self.poly_lora_B[active_adapter] - - # Combine the output of LoRAs - # https://github.com/microsoft/mttl/blob/ce4ca51dbca73be656feb9b3e5233633e3c5dec7/mttl/models/poly.py#L293 - mixing_weights = poly_router(task_ids=task_ids, input_ids=x) - bs, n_splits, n_skills = mixing_weights.size() - - # A is n_splits, n_skills, D // n_splits, rank - # we want bs, n_splits, D // n_splits, rank - A = ops.einsum("bqs,qsdr->bqdr", (mixing_weights, poly_lora_A)) - B = ops.einsum("bqs,qsrd->bqrd", (mixing_weights, poly_lora_B)) - - A = A.reshape(bs, self.in_features, r) - B = B.transpose(1, 2).reshape(bs, r, self.out_features) - - x = x.to(A.dtype) - result += x.bmm(A).bmm(B) / r - - result = result.to(previous_dtype) - return result - - def __repr__(self) -> str: - rep = super().__repr__() - return "poly." + rep diff --git a/mindnlp/peft/tuners/poly/model.py b/mindnlp/peft/tuners/poly/model.py deleted file mode 100644 index 203b28c4b..000000000 --- a/mindnlp/peft/tuners/poly/model.py +++ /dev/null @@ -1,198 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
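For reviewers, here is a self-contained NumPy walk-through of the mixing step that the removed `Dense.forward` implements: the router's mixing weights blend the per-split, per-skill LoRA factors, which are then applied as a batched low-rank update on top of the frozen base output. Shapes and values are illustrative stand-ins for the MindSpore tensors.

```py
# NumPy stand-in for the Poly mixing computation; sizes are assumptions for illustration.
import numpy as np

bs, seq_len, d_in, d_out = 2, 5, 16, 16    # batch, sequence length, layer in/out features
n_splits, n_skills, r = 1, 4, 8            # Poly hyper-parameters

mixing_weights = np.random.rand(bs, n_splits, n_skills)               # router output
poly_lora_A = np.random.rand(n_splits, n_skills, d_in // n_splits, r)
poly_lora_B = np.random.rand(n_splits, n_skills, r, d_out // n_splits)

# Blend the per-skill factors with the routing weights, then flatten the split axis.
A = np.einsum("bqs,qsdr->bqdr", mixing_weights, poly_lora_A).reshape(bs, d_in, r)
B = np.einsum("bqs,qsrd->bqrd", mixing_weights, poly_lora_B)
B = B.transpose(0, 2, 1, 3).reshape(bs, r, d_out)

x = np.random.rand(bs, seq_len, d_in)
delta = np.matmul(np.matmul(x, A), B) / r   # mirrors x.bmm(A).bmm(B) / r in the removed layer
print(delta.shape)                          # (2, 5, 16): added onto the base layer's output
```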
-"""poly model""" - - -from contextlib import contextmanager -from dataclasses import asdict -from enum import Enum -from typing import Any - -from mindnlp.core import nn - -from mindnlp.peft.tuners.tuners_utils import ( - BaseTuner, - BaseTunerLayer, - check_target_module_exists, -) -from mindnlp.peft.utils import ( - TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, - ModulesToSaveWrapper, -) - -from .config import PolyConfig -from .layer import Dense, PolyLayer - - -class PolyModel(BaseTuner): - prefix: str = "poly_" - - # def __init__(self, model, config, adapter_name) -> None: - # super().__init__(model, config, adapter_name) - - @staticmethod - def _check_target_module_exists(poly_config, key): - return check_target_module_exists(poly_config, key) - - def _create_and_replace( - self, - poly_config: PolyConfig, - adapter_name: str, - target: nn.Module, - target_name: str, - parent: nn.Module, - **optional_kwargs: Any, - ): - if isinstance(target, PolyLayer): - target.update_layer(adapter_name, poly_config) - else: - new_cell = self._create_new_cell( - poly_config, - adapter_name, - target, - ) - if adapter_name not in self.active_adapters: - # adding an additional adapter: it is not automatically trainable - new_cell.requires_grad = False - self._replace_cell(parent, target_name, new_cell, target) - - def _replace_cell(self, parent, child_name, new_cell, child): - setattr(parent, child_name, new_cell) - # It's not necessary to set requires_grad here, as that is handled by - # _mark_only_adapters_as_trainable - - # child layer wraps the original cell, unpack it - if hasattr(child, "base_layer"): - child = child.base_layer - - if not hasattr(new_cell, "base_layer"): - new_cell.weight = child.weight - if hasattr(child, "bias"): - new_cell.bias = child.bias - - if getattr(child, "state", None) is not None: - if hasattr(new_cell, "base_layer"): - new_cell.base_layer.state = child.state - else: - new_cell.state = child.state - # new_cell.to(child.weight.device) - - # dispatch to correct device - # for name, cell in new_cell.parameters_and_names(): - # if (self.prefix in name) or ("ranknum" in name): - # weight = child.qweight if hasattr(child, "qweight") else child.weight - # cell.to(weight.device) - - def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: - for name, cell in model.parameters_and_names(): - if self.prefix not in name: - cell.requires_grad = False - - @staticmethod - def _create_new_cell(poly_config, adapter_name, target, **kwargs): - if isinstance(target, BaseTunerLayer): - target_base_layer = target.get_base_layer() - else: - target_base_layer = target - - if isinstance(target_base_layer, nn.Linear): - return Dense(target, adapter_name, poly_config, **kwargs) - else: - raise ValueError( - f"Target cell {target} is not supported. Currently, only the following modules are supported: " - "`nn.Linear`." 
- ) - - def __getattr__(self, name: str): - """Construct missing attributes to the wrapped cell.""" - try: - return super().__getattr__(name) # defer to nn.Module's logic - except AttributeError: - return getattr(self.model, name) - - def get_peft_config_as_dict(self, inference: bool = False): - for _, value in self.peft_config.items(): - config = { - k: v.value if isinstance(v, Enum) else v - for k, v in asdict(value).items() - } - if inference: - config["inference_mode"] = True - return config - - def _set_adapter_layers(self, enabled=True): - for cell in self.model.modules(): - if isinstance(cell, (PolyLayer, ModulesToSaveWrapper)): - cell.enable_adapters(enabled) - - def enable_adapter_layers(self): - self._set_adapter_layers(enabled=True) - - def disable_adapter_layers(self): - self._set_adapter_layers(enabled=False) - - def set_adapter(self, adapter_name): - for cell in self.model.modules(): - if isinstance(cell, PolyLayer): - cell.set_adapter(adapter_name) - - def _prepare_adapter_config(self, peft_config, model_config): - if peft_config.target_modules is None: - if ( - model_config["model_type"] - not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING - ): - raise ValueError("Please specify `target_modules` in `peft_config`") - peft_config.target_modules = set( - TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[ - model_config["model_type"] - ] - ) - return peft_config - - def _register_pre_hooks(self, task_ids): - """Helper method to register pre hooks.""" - if task_ids is None: - return [] - - def pre_hook(_, inputs): - args, kwargs = inputs - kwargs["task_ids"] = task_ids - return args, kwargs - - handles = [] - for cell in self.model.modules(): - if isinstance(cell, Dense): - handle = cell.register_forward_pre_hook(pre_hook) - handles.append(handle) - - return handles - - @contextmanager - def _manage_pre_hooks(self, task_ids): - """Context manager to handle the lifecycle of pre hooks.""" - handles = self._register_pre_hooks(task_ids) - try: - yield - finally: - for handle in handles: - handle.remove() - - def forward(self, *args, task_ids=None, **kwargs): - with self._manage_pre_hooks(task_ids): - return self.model(*args, **kwargs) - - def generate(self, *args, task_ids=None, **kwargs): - with self._manage_pre_hooks(task_ids): - return self.model.generate(*args, **kwargs) diff --git a/mindnlp/peft/tuners/poly/router.py b/mindnlp/peft/tuners/poly/router.py deleted file mode 100644 index 1878750a2..000000000 --- a/mindnlp/peft/tuners/poly/router.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
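The removed `PolyModel` threads `task_ids` down to every Poly `Dense` layer by temporarily registering forward pre-hooks around each call to `forward`/`generate`. The snippet below is a stand-alone illustration of that pattern only; it uses PyTorch's hook API as a stand-in for `mindnlp.core.nn`, and `TaskAwareLinear` is a made-up layer, not part of this codebase.

```py
# Illustrative sketch of kwarg injection via a forward pre-hook (PyTorch stand-in).
import torch
from torch import nn

class TaskAwareLinear(nn.Module):
    def __init__(self, d):
        super().__init__()
        self.base = nn.Linear(d, d)

    def forward(self, x, task_ids=None):
        # A real Poly layer would route on task_ids; here we only show that it arrives.
        print("task_ids seen by layer:", task_ids)
        return self.base(x)

def pre_hook(_module, args, kwargs):
    kwargs["task_ids"] = torch.tensor([0])   # injected per call, then the hook is removed
    return args, kwargs

layer = TaskAwareLinear(4)
handle = layer.register_forward_pre_hook(pre_hook, with_kwargs=True)
layer(torch.randn(2, 4))
handle.remove()
```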
-"""poly router""" -from abc import ABC, abstractmethod - -import mindspore - -from mindnlp.core import nn, ops -from mindnlp.core.nn import Parameter -from mindnlp.core.distributions.relaxed_bernoulli import RelaxedBernoulli -from .config import PolyConfig - -EPS = 1e-12 - - -def get_router(poly_config: PolyConfig) -> nn.Module: - if poly_config.poly_type == "poly": - return PolyRouter(poly_config) - else: - raise ValueError( - f"Unsupported poly_type: {poly_config.poly_type}. " - "Currently, only the following types are supported: " - "`poly`." - ) - - -class Router(nn.Module, ABC): - @abstractmethod - def reset(self): ... - - @abstractmethod - def forward(self, task_ids: mindspore.Tensor, input_ids: mindspore.Tensor): ... - - -class PolyRouter(Router): - # It's a simplified implementation of - # https://github.com/microsoft/mttl/blob/ce4ca51dbca73be656feb9b3e5233633e3c5dec7/mttl/models/poly.py#L138 - def __init__(self, poly_config: PolyConfig): - super().__init__() - - self.poly_type = poly_config.poly_type - self.n_tasks = poly_config.n_tasks - self.n_skills = poly_config.n_skills - self.n_splits = poly_config.n_splits - - self.module_logits = Parameter( - ops.zeros(self.n_tasks, self.n_splits * self.n_skills) - ) - - def reset(self): - nn.init.uniform_(self.module_logits, -1e-3, 1e-3) - - def forward(self, task_ids: mindspore.Tensor, input_ids: mindspore.Tensor): - if task_ids is None: - raise ValueError("task_ids should not be None.") - if task_ids.max().item() >= self.n_tasks: - raise ValueError( - f"Only {self.n_tasks} tasks available. Found task id = {task_ids.max().item()}" - ) - - # move task id to input's device - # task_ids = task_ids.to(self.module_logits.device) - - module_logits = self.module_logits[task_ids] - module_logits = module_logits.view(-1, self.n_splits, self.n_skills) - - if self.training: - module_logits = RelaxedBernoulli( - temperature=1.0, logits=module_logits - ).rsample() - else: - module_logits = ops.sigmoid(module_logits) - - module_weights = module_logits / (module_logits.sum(dim=-1, keepdim=True) + EPS) - - return module_weights diff --git a/mindnlp/peft/tuners/prefix_tuning/__init__.py b/mindnlp/peft/tuners/prefix_tuning/__init__.py deleted file mode 100644 index 2fd6e2d9b..000000000 --- a/mindnlp/peft/tuners/prefix_tuning/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""prefix tuning""" -from .config import PrefixTuningConfig -from .model import PrefixEncoder - - -__all__ = ["PrefixTuningConfig", "PrefixEncoder"] diff --git a/mindnlp/peft/tuners/prefix_tuning/config.py b/mindnlp/peft/tuners/prefix_tuning/config.py deleted file mode 100644 index 00440499b..000000000 --- a/mindnlp/peft/tuners/prefix_tuning/config.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""prefix tuning config""" -from dataclasses import dataclass, field - -from ...config import PromptLearningConfig -from ...utils import PeftType - - -@dataclass -class PrefixTuningConfig(PromptLearningConfig): - """ - This is the configuration class to store the configuration of a [`PrefixEncoder`]. - - Args: - encoder_hidden_size (`int`): The hidden size of the prompt encoder. - prefix_projection (`bool`): Whether to project the prefix embeddings. - """ - encoder_hidden_size: int = field( - default=None, - metadata={"help": "The hidden size of the encoder"}, - ) - prefix_projection: bool = field( - default=False, - metadata={"help": "Whether to project the prefix tokens"}, - ) - - def __post_init__(self): - """ - The '__post_init__' method is a special method in the 'PrefixTuningConfig' class that is automatically called after the initialization of a new instance of the class. - - Args: - self: An instance of the 'PrefixTuningConfig' class. - - Returns: - None. This method does not return any value. - - Raises: - This method does not raise any exceptions. - """ - self.peft_type = PeftType.PREFIX_TUNING diff --git a/mindnlp/peft/tuners/prefix_tuning/model.py b/mindnlp/peft/tuners/prefix_tuning/model.py deleted file mode 100644 index 3c4c8affd..000000000 --- a/mindnlp/peft/tuners/prefix_tuning/model.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Based on https://github.com/THUDM/P-tuning-v2/blob/main/model/prefix_encoder.py -# with some refactor -"""prefix tuning model""" -import mindspore -from mindnlp.core import nn - -class PrefixEncoder(nn.Module): - r""" - The `mindspore.nn` model to encode the prefix. - - Args: - config ([`PrefixTuningConfig`]): The configuration of the prefix encoder. - - Example: - - ```py - >>> from peft import PrefixEncoder, PrefixTuningConfig - - >>> config = PrefixTuningConfig( - ... peft_type="PREFIX_TUNING", - ... task_type="SEQ_2_SEQ_LM", - ... num_virtual_tokens=20, - ... token_dim=768, - ... num_transformer_submodules=1, - ... num_attention_heads=12, - ... num_layers=12, - ... encoder_hidden_size=768, - ... ) - >>> prefix_encoder = PrefixEncoder(config) - ``` - - **Attributes**: - - **embedding** (`mindspore.nn.Embedding`) -- The embedding layer of the prefix encoder. - - **transform** (`mindspore.nn.Sequential`) -- The two-layer MLP to transform the prefix embeddings if - `prefix_projection` is `True`. - - **prefix_projection** (`bool`) -- Whether to project the prefix embeddings. 
- - Input shape: (`batch_size`, `num_virtual_tokens`) - - Output shape: (`batch_size`, `num_virtual_tokens`, `2*layers*hidden`) - """ - def __init__(self, config): - """ - Initializes the PrefixEncoder class. - - Args: - self: The object instance. - config (object): A configuration object containing the following attributes: - - prefix_projection (bool): Indicates whether prefix projection should be applied. - - token_dim (int): The dimension of the token embedding. - - num_layers (int): The number of layers in the encoder. - - encoder_hidden_size (int): The size of the hidden state in the encoder. - - num_virtual_tokens (int): The number of virtual tokens. - - Returns: - None: This method does not return any value. - - Raises: - ValueError: If the prefix_projection attribute is True and the inference_mode attribute in the config object is not set. - """ - super().__init__() - self.prefix_projection = config.prefix_projection - token_dim = config.token_dim - num_layers = config.num_layers - encoder_hidden_size = config.encoder_hidden_size - num_virtual_tokens = config.num_virtual_tokens - if self.prefix_projection and not config.inference_mode: - # Use a two-layer MLP to encode the prefix - self.embedding = nn.Embedding(num_virtual_tokens, token_dim) - self.transform = nn.Sequential( - nn.Linear(token_dim, encoder_hidden_size), - nn.Tanh(), - nn.Linear(encoder_hidden_size, num_layers * 2 * token_dim), - ) - else: - self.embedding = nn.Embedding(num_virtual_tokens, num_layers * 2 * token_dim) - - def forward(self, prefix: mindspore.Tensor): - """ - This method forwards the past key values based on the provided prefix for the PrefixEncoder. - - Args: - self (PrefixEncoder): The instance of the PrefixEncoder class. - prefix (mindspore.Tensor): The input prefix tensor used for forwarding past key values. - - Returns: - None: This method does not return any value. - - Raises: - - TypeError: If the prefix is not of type mindspore.Tensor. - - ValueError: If the prefix projection is enabled and the prefix_tokens cannot be obtained or transformed. - - RuntimeError: If there is an issue with the embedding or transformation process. - """ - if self.prefix_projection: - prefix_tokens = self.embedding(prefix) - past_key_values = self.transform(prefix_tokens) - else: - past_key_values = self.embedding(prefix) - return past_key_values diff --git a/mindnlp/peft/tuners/prompt_tuning/__init__.py b/mindnlp/peft/tuners/prompt_tuning/__init__.py deleted file mode 100644 index 320cd9c02..000000000 --- a/mindnlp/peft/tuners/prompt_tuning/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
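To summarize what the removed `PrefixEncoder` produces, the shape sketch below contrasts its two branches: with `prefix_projection` the prefix indices are embedded to `token_dim` and expanded by a two-layer MLP, otherwise a single embedding table directly stores `num_layers * 2 * token_dim` values per virtual token. PyTorch is used here as a stand-in for `mindnlp.core.nn`, and the hyper-parameters are illustrative.

```py
# Shape sketch of the two PrefixEncoder branches (PyTorch stand-in; values assumed).
import torch
from torch import nn

num_virtual_tokens, token_dim, num_layers, hidden = 20, 768, 12, 768
prefix = torch.arange(num_virtual_tokens).unsqueeze(0)     # (batch=1, num_virtual_tokens)

# prefix_projection=True: embed to token_dim, then project to the past_key_values size.
embedding = nn.Embedding(num_virtual_tokens, token_dim)
transform = nn.Sequential(
    nn.Linear(token_dim, hidden),
    nn.Tanh(),
    nn.Linear(hidden, num_layers * 2 * token_dim),
)
print(transform(embedding(prefix)).shape)   # (1, 20, 2 * 12 * 768), later split into per-layer K/V

# prefix_projection=False: the embedding table already has the target width, no MLP.
flat_embedding = nn.Embedding(num_virtual_tokens, num_layers * 2 * token_dim)
print(flat_embedding(prefix).shape)         # same target shape
```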
-"""prompt tuning""" -from .config import PromptTuningConfig, PromptTuningInit -from .model import PromptEmbedding - - -__all__ = ["PromptTuningConfig", "PromptEmbedding", "PromptTuningInit"] diff --git a/mindnlp/peft/tuners/prompt_tuning/config.py b/mindnlp/peft/tuners/prompt_tuning/config.py deleted file mode 100644 index 95e889252..000000000 --- a/mindnlp/peft/tuners/prompt_tuning/config.py +++ /dev/null @@ -1,196 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""prompt tuning config.""" -import enum -from dataclasses import dataclass, field -from typing import Optional, Union - -from ...config import PromptLearningConfig -from ...utils import PeftType - - -class PromptTuningInit(str, enum.Enum): - - r""" - Represents an initialization state for prompt tuning in a Python class named 'PromptTuningInit'. - This class inherits from the 'str' class and the 'enum.Enum' class. - - PromptTuningInit is used to define and manage the initialization state for prompt tuning. - It provides functionality to set and retrieve the initialization state, and inherits - all the methods and attributes of the 'str' class and the 'enum.Enum' class. - - Attributes: - - None - - Methods: - - None - - Inherited Attributes from the 'str' class: - - capitalize() - - casefold() - - center() - - count() - - encode() - - endswith() - - expandtabs() - - find() - - format() - - format_map() - - index() - - isalnum() - - isalpha() - - isascii() - - isdecimal() - - isdigit() - - isidentifier() - - islower() - - isnumeric() - - isprintable() - - isspace() - - istitle() - - isupper() - - join() - - ljust() - - lower() - - lstrip() - - maketrans() - - partition() - - replace() - - rfind() - - rindex() - - rjust() - - rpartition() - - rsplit() - - rstrip() - - split() - - splitlines() - - startswith() - - strip() - - swapcase() - - title() - - translate() - - upper() - - zfill() - - Inherited Attributes from the 'enum.Enum' class: - - name - - value - - Inherited Methods from the 'enum.Enum' class: - - __class__ - - __contains__ - - __delattr__ - - __dir__ - - __eq__ - - __format__ - - __ge__ - - __getattribute__ - - __getitem__ - - __gt__ - - __hash__ - - __init__ - - __init_subclass__ - - __iter__ - - __le__ - - __len__ - - __lt__ - - __members__ - - __module__ - - __ne__ - - __new__ - - __reduce__ - - __reduce_ex__ - - __repr__ - - __setattr__ - - __sizeof__ - - __str__ - - __subclasshook__ - - """ - TEXT = "TEXT" - RANDOM = "RANDOM" - - -@dataclass -class PromptTuningConfig(PromptLearningConfig): - """ - This is the configuration class to store the configuration of a [`PromptEmbedding`]. - - Args: - prompt_tuning_init (Union[[`PromptTuningInit`], `str`]): The initialization of the prompt embedding. - prompt_tuning_init_text (`str`, *optional*): - The text to initialize the prompt embedding. Only used if `prompt_tuning_init` is `TEXT`. - tokenizer_name_or_path (`str`, *optional*): - The name or path of the tokenizer. Only used if `prompt_tuning_init` is `TEXT`. 
- tokenizer_kwargs (`dict`, *optional*): - The keyword arguments to pass to `AutoTokenizer.from_pretrained`. Only used if `prompt_tuning_init` is - `TEXT`. - """ - prompt_tuning_init: Union[PromptTuningInit, str] = field( - default=PromptTuningInit.RANDOM, - metadata={"help": "How to initialize the prompt tuning parameters"}, - ) - prompt_tuning_init_text: Optional[str] = field( - default=None, - metadata={ - "help": "The text to use for prompt tuning initialization. Only used if prompt_tuning_init is `TEXT`" - }, - ) - tokenizer_name_or_path: Optional[str] = field( - default=None, - metadata={ - "help": "The tokenizer to use for prompt tuning initialization. Only used if prompt_tuning_init is `TEXT`" - }, - ) - - tokenizer_kwargs: Optional[dict] = field( - default=None, - metadata={ - "help": ( - "The keyword arguments to pass to `AutoTokenizer.from_pretrained`. Only used if prompt_tuning_init is " - "`TEXT`" - ), - }, - ) - - def __post_init__(self): - r""" - This method initializes the PromptTuningConfig object after its creation. - - Args: - self: The instance of the PromptTuningConfig class. - - Returns: - None. This method does not return any value. - - Raises: - - ValueError: If the prompt_tuning_init is set to TEXT and tokenizer_name_or_path is not provided. - - ValueError: If the prompt_tuning_init is set to TEXT and prompt_tuning_init_text is not provided. - - ValueError: If tokenizer_kwargs is provided but prompt_tuning_init is not set to TEXT. - """ - self.peft_type = PeftType.PROMPT_TUNING - if (self.prompt_tuning_init == PromptTuningInit.TEXT) and not self.tokenizer_name_or_path: - raise ValueError( - f"When prompt_tuning_init='{PromptTuningInit.TEXT.value}', " - f"tokenizer_name_or_path can't be {self.tokenizer_name_or_path}." - ) - if (self.prompt_tuning_init == PromptTuningInit.TEXT) and self.prompt_tuning_init_text is None: - raise ValueError( - f"When prompt_tuning_init='{PromptTuningInit.TEXT.value}', " - f"prompt_tuning_init_text can't be {self.prompt_tuning_init_text}." - ) - if self.tokenizer_kwargs and (self.prompt_tuning_init != PromptTuningInit.TEXT): - raise ValueError( - f"tokenizer_kwargs only valid when using prompt_tuning_init='{PromptTuningInit.TEXT.value}'." - ) diff --git a/mindnlp/peft/tuners/prompt_tuning/model.py b/mindnlp/peft/tuners/prompt_tuning/model.py deleted file mode 100644 index ec678f5ca..000000000 --- a/mindnlp/peft/tuners/prompt_tuning/model.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""prompt tuning model""" -import math - -import mindspore -from mindnlp.core.nn import Parameter -from mindnlp.core import nn -from .config import PromptTuningInit - -class PromptEmbedding(nn.Module): - """ - The model to encode virtual tokens into prompt embeddings. - - Args: - config ([`PromptTuningConfig`]): The configuration of the prompt embedding. - word_embeddings (`nn.Module`): The word embeddings of the base transformer model. 
- - **Attributes**: - - **embedding** (`nn.Embedding`) -- The embedding layer of the prompt embedding. - - Example: - - ```py - >>> from peft import PromptEmbedding, PromptTuningConfig - - >>> config = PromptTuningConfig( - ... peft_type="PROMPT_TUNING", - ... task_type="SEQ_2_SEQ_LM", - ... num_virtual_tokens=20, - ... token_dim=768, - ... num_transformer_submodules=1, - ... num_attention_heads=12, - ... num_layers=12, - ... prompt_tuning_init="TEXT", - ... prompt_tuning_init_text="Predict if sentiment of this review is positive, negative or neutral", - ... tokenizer_name_or_path="t5-base", - ... ) - - >>> # t5_model.shared is the word embeddings of the base model - >>> prompt_embedding = PromptEmbedding(config, t5_model.shared) - ``` - - Input Shape: (`batch_size`, `total_virtual_tokens`) - - Output Shape: (`batch_size`, `total_virtual_tokens`, `token_dim`) - """ - def __init__(self, config, word_embeddings): - r""" - Initialize the PromptEmbedding class. - - Args: - self: Reference to the current instance of the class. - config (object): Configuration object containing various settings. - - num_virtual_tokens (int): Number of virtual tokens. - - num_transformer_submodules (int): Number of transformer submodules. - - token_dim (int): Dimensionality of the token embeddings. - - prompt_tuning_init (Enum): Specifies the type of prompt tuning initialization. - - inference_mode (bool): Indicates if the model is in inference mode. - - tokenizer_kwargs (dict, optional): Additional keyword arguments for the tokenizer. - - tokenizer_name_or_path (str): Name or path of the pretrained tokenizer. - - prompt_tuning_init_text (str): Text used for prompt tuning initialization. - word_embeddings (object): Word embeddings for initializing the embedding layer. - - Returns: - None. The method initializes the embedding layer with the provided word embeddings. - - Raises: - ImportError: If the transformers module cannot be imported. - ValueError: If the number of text tokens exceeds the total virtual tokens. - TypeError: If the word embedding weights cannot be converted to float32. - """ - super().__init__() - - total_virtual_tokens = config.num_virtual_tokens * config.num_transformer_submodules - self.embedding = nn.Embedding(total_virtual_tokens, config.token_dim) - if config.prompt_tuning_init == PromptTuningInit.TEXT and not config.inference_mode: - from ....transformers import AutoTokenizer - - tokenizer_kwargs = config.tokenizer_kwargs or {} - tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name_or_path, **tokenizer_kwargs) - init_text = config.prompt_tuning_init_text - init_token_ids = tokenizer(init_text)["input_ids"] - # Trim or iterate until num_text_tokens matches total_virtual_tokens - num_text_tokens = len(init_token_ids) - if num_text_tokens > total_virtual_tokens: - init_token_ids = init_token_ids[:total_virtual_tokens] - elif num_text_tokens < total_virtual_tokens: - num_reps = math.ceil(total_virtual_tokens / num_text_tokens) - init_token_ids = init_token_ids * num_reps - init_token_ids = init_token_ids[:total_virtual_tokens] - init_token_ids = mindspore.tensor(init_token_ids) - word_embedding_weights = word_embeddings(init_token_ids).copy() - word_embedding_weights = word_embedding_weights.to(mindspore.float32) - self.embedding.weight = Parameter(word_embedding_weights) - - def forward(self, indices): - r""" - Construct the prompt embeddings based on the given indices. - - Args: - self (PromptEmbedding): An instance of the PromptEmbedding class. 
- indices (int): The indices used to retrieve the prompt embeddings. - - Returns: - None: This method does not return any value. - - Raises: - None: This method does not raise any exceptions. - """ - # Just get embeddings - prompt_embeddings = self.embedding(indices) - return prompt_embeddings diff --git a/mindnlp/peft/tuners/tuners_utils.py b/mindnlp/peft/tuners/tuners_utils.py deleted file mode 100644 index c9532a720..000000000 --- a/mindnlp/peft/tuners/tuners_utils.py +++ /dev/null @@ -1,814 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -BaseTuner class and BaseTunerLayer class. -""" -from __future__ import annotations -import re -import logging -import warnings -import copy -from typing import Any, Optional, Union -from abc import ABC -from contextlib import contextmanager -from mindspore import Tensor -from mindnlp.core import nn - -from ..config import PeftConfig -from ..utils import _get_submodules - -logger = logging.getLogger(__name__) - - -@contextmanager -def onload_layer(layer): - r""" - A utility for modifying a cell containing one or more tuners and a base layer, any of which are offloaded to the - CPU or disk. Moves a cell's sub-modules to the execution device before some action is performed, after that the - base layer state dictionary is re-assigned (if that layer was offloaded to the disk) and finally the parameters are - offloaded. - - If the cell has no offloaded sub-modules, this function does nothing. 
- - Args: - layer ('mindspore.nn.Module'): - layer with tuners to be merged - """ - offloaded_modules = [] - for name, module in layer.named_modules(): - if name in ["", "base_layer"]: - continue - # if hasattr(cell, "_hf_hook") and isinstance(cell._hf_hook, AlignDevicesHook) and cell._hf_hook.offload: - # cell._hf_hook.pre_forward(cell) - # offloaded_modules.append(cell) - - # base_layer_offload = False - # if hasattr(layer, "base_layer") and ( - # hasattr(layer.base_layer, "_hf_hook") - # and isinstance(layer.base_layer._hf_hook, AlignDevicesHook) - # and layer.base_layer._hf_hook.offload - # ): - # # check if the base layer is disk-offloaded (must contain a 'dataset' and an offload index) - # if torch.device("meta") in layer.base_layer._hf_hook.original_devices.values() and hasattr( - # layer.base_layer._hf_hook.weights_map, "dataset" - # ): - # # find the disk-offload index (maps modules to safetensors) from the `dataset` (OffloadedWeightsLoader object) - # index = layer.base_layer._hf_hook.weights_map.dataset.index - # module_name = list(dict(layer.base_layer._hf_hook.weights_map.dataset).keys())[0] # any cell will do - # file_name = index[module_name]["safetensors_file"] - # base_name_arr = [] - # # get effective dir name - # for i in os.path.split(file_name): - # if "--" in i: - # base_name_arr.append(i) - # break - # base_name_arr.append(i) - # base_name = os.path.join(*base_name_arr) - # safetensors_filename = base_name + "-merged" - # layer.base_layer._hf_hook.pre_forward(layer.base_layer) - # base_layer_offload = True - - # yield - - # for cell in offloaded_modules: - # cell._hf_hook.post_forward(cell, torch.tensor([])) - - # if base_layer_offload: - # # re-make weights map (must be on cpu to send params to the disk via memmap if disk offload) - # layer.base_layer._hf_hook.weights_map = { - # name: param.to("cpu") for name, param in named_module_tensors(layer.base_layer) - # } - # # offload weights map to disk if original device is the disk - # if torch.device("meta") in layer.base_layer._hf_hook.original_devices.values() and hasattr( - # layer.base_layer._hf_hook.weights_map, "dataset" - # ): - # # rewrite directory with merged weights - # offload_state_dict(safetensors_filename, layer.base_layer._hf_hook.weights_map) - # layer.base_layer._hf_hook.post_forward(layer.base_layer, torch.tensor([])) - - -class BaseTuner(nn.Module): - r""" - A base tuner model that provides the common methods and attributes for all tuners that are injectable into a - torch.nn.Module - - For adding a new Tuner class, one needs to overwrite the following methods: - - - **_prepare_adapter_config**: - A private method to eventually prepare the adapter config, for example in case the field `target_modules` is - missing. - - **_check_new_adapter_config**: - A helper private method to check if the passed cell's key name matches any of the target modules in the - adatper_config. - - **_create_and_replace**: - A private method to create and replace the target cell with the adapter cell. - - **_check_target_module_exists**: - A private helper method to check if the passed cell's key name matches any of the target modules in the - adatper_config. - - The easiest is to check what is done in the `peft.tuners.lora.LoraModel` class. - - Attributes: - model (`mindspore.nn.Module`): - The model to which the adapter tuner layers will be attached. - forward (`Callable`): - The forward method of the model. 
- peft_config (`Union[`PeftConfig`, dict[str, PeftConfig]]`): - The adapter configuration object, it should be a dictionary of `str` to `PeftConfig` objects. One can also - pass a PeftConfig object and a new adapter will be created with the default name `adapter` or create a new - dictionary with a key `adapter_name` and a value of that peft config. - config (`dict[str, Any]`): - The model configuration object, it should be a dictionary of `str` to `Any` objects. - """ - def __init__(self, model, peft_config: Union[PeftConfig, dict[str, PeftConfig]], adapter_name: str) -> None: - r""" - __init__ - - Initializes an instance of the BaseTuner class. - - Args: - - self: The instance of the class. - - model: The model to be tuned. - - peft_config: A Union of PeftConfig or a dictionary of adapter names to PeftConfig objects. It specifies the configuration for the adapter. - - adapter_name: A string representing the name of the adapter. - - Returns: - None. The method initializes the instance of the BaseTuner class. - - Raises: - - AttributeError: If the 'peft_config' attribute is already found in the model, indicating the presence of multiple adapters in the model. - - TypeError: If the 'peft_config' parameter is not of type PeftConfig or dictionary of adapter names to PeftConfig objects. - """ - super().__init__() - # self.peft_config = config - # self.add_adapter(adapter_name, self.peft_config[adapter_name]) - - self.model = model - - # For advanced developers, if you want to attach multiple adapters to your - # model, just add a `peft_config` dict attribute to your model. - if not hasattr(self, "peft_config"): - self.peft_config = {adapter_name: peft_config} if isinstance(peft_config, PeftConfig) else peft_config - else: - logger.info( - "Already found a `peft_config` attribute in the model. This will lead to having multiple adapters" - " in the model. Make sure to know what you are doing!" - ) - if isinstance(peft_config, PeftConfig): - self.peft_config[adapter_name] = peft_config - else: - # user is adding a dict of PeftConfigs - self.peft_config.update(peft_config) - - # transformers models have a .config attribute, whose presence is assumed later on - # if not hasattr(self, "config"): - # self.config = {"model_type": "custom"} - - self.active_adapter: str | list[str] = adapter_name - self.inject_adapter(self.model, adapter_name) - - # Copy the peft_config in the injected model. - self.model.peft_config = self.peft_config - - @property - def active_adapters(self) -> list[str]: - r""" - Method to retrieve the active adapters. - - Args: - self: BaseTuner object. The instance of the BaseTuner class. - - Returns: - list[str]: A list of active adapters. If the active_adapter attribute is a string, it is returned as a single-element list. - Otherwise, the active_adapter attribute itself is returned. - - Raises: - None - """ - if isinstance(self.active_adapter, str): - return [self.active_adapter] - # is already a list of str - return self.active_adapter - - def forward(self, *args: Any, **kwargs: Any): - r""" - This method forwards an instance of the BaseTuner class. - - Args: - self: The instance of the BaseTuner class. - - Returns: - None. This method does not return any value. - - Raises: - None. This method does not raise any exceptions. - """ - return self.model.forward(*args, **kwargs) - - def _prepare_adapter_config(self, peft_config: PeftConfig, model_config: dict) -> PeftConfig: - r""" - A private method to eventually prepare the adapter config. 
For transformers based models, if - `peft_config.target_modules` is None, we can automatically infer the target modules from the - `TRANSFORMERS_MODELS_TO_XXX_TARGET_MODULES_MAPPING`. This method can be further refactored in the future to - automatically infer it for all tuner models. - - Check out `peft.tuner.lora.LoraModel._prepare_adapter_config` for an example. - - Args: - peft_config (`str`): - The adapter config. - model_config (`str`): - The transformers model config, that config should contain the `model_type` key. - """ - return None - - @staticmethod - def _check_target_module_exists(peft_config: PeftConfig, key: str) -> bool: - r""" - A helper private method to check if the passed cell's key name matches any of the target modules in the - `peft_config.target_modules` list. If it does, return `True`, else return `False`. - - Args: - peft_config (`PeftConfig`): - The adapter config. - key (`str`): - The cell's key name. - """ - def _create_and_replace( - self, - peft_config: PeftConfig, - adapter_name: str, - target: nn.Module, - target_name: str, - parent: nn.Module, - current_key: str, - ) -> None: - r""" - Inplace replacement of the target cell with the adapter layer. This method needs to be overriden by all the - tuner classes. - - Check `peft.tuners.lora.LoraModel._create_and_replace` for an example. - - Args: - peft_config (`PeftConfig`): - The adapter config. - adapter_name (`str`): - The adapter name. - target (`nn.Module`): - The target cell. - target_name (`str`): - The target cell's name. - parent (`nn.Module`): - The parent cell. - **optionnal_kwargs (`dict`): - The optional keyword arguments to pass to deal with particular cases (e.g. 8bit, 4bit quantization) - """ - def _mark_only_adapters_as_trainable(self, model): - r""" - A helper method to mark only the adapter layers as trainable (i.e. cell.requires_grad = False) This needs to - be overriden for all tuner classes to match the correct key names. - - Check `peft.tuners.lora.LoraModel._mark_only_adapters_as_trainable` for an example. - """ - def _check_new_adapter_config(self, config: PeftConfig) -> None: - """ - A helper method to check the config when a new adapter is being added. - - Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters. - - """ - # def add_adapter(self, adapter_name, config=None): - # """add adapter""" - # if config is not None: - # model_config = self.model.config.to_dict() if hasattr(self.model.config, "to_dict") else self.model.config - # config = self._prepare_lora_config(config, model_config) - # self.peft_config[adapter_name] = config - - # self._find_and_replace(adapter_name) - - # if len(self.peft_config) > 1 and self.peft_config[adapter_name].bias != "none": - # raise ValueError( - # "LoraModel supports only 1 adapter with bias. When using multiple adapters, set bias to 'none' for all adapters." - # ) - # # only lora trainable - # self._mark_only_adapters_as_trainable() - # if self.peft_config[adapter_name].inference_mode: - # # freeze adapter - # _freeze_adapter(self.model, adapter_name) - def inject_adapter(self, model: nn.Module, adapter_name: str): - r""" - Creates adapter layers and replaces the target modules with the adapter layers. This method is called under the - hood by `peft.mapping.get_peft_model` if a non-prompt tuning adapter class is passed, e.g. LoRA. - - The corresponding PEFT config is directly retrieved from the `peft_config` attribute of the BaseTuner class. - Rename add_adapter -> inject_adapter. 
- - Args: - model (`nn.Module`): - The model to be tuned. - adapter_name (`str`): - The adapter name. - """ - peft_config = self.peft_config[adapter_name] - # Note: If possible, all checks should be performed *at the start of this method*. - # This way, we can raise early if something goes wrong, without leaving the model - # in a bad (half-initialized) state. - self._check_new_adapter_config(peft_config) - - is_target_modules_in_base_model = False - key_list = [key for key, _ in model.named_modules()] # named_modules - - model_config = getattr(model, "config", {"model_type": "custom"}) - if hasattr(model_config, "to_dict"): - model_config = model_config.to_dict() - - peft_config = self._prepare_adapter_config(peft_config, model_config) # pylint: disable=assignment-from-none - for key in key_list: - if not self._check_target_module_exists(peft_config, key): - continue - - is_target_modules_in_base_model = True - parent, target, target_name = _get_submodules(model, key) - - optionnal_kwargs = { - "loaded_in_8bit": getattr(model, "is_loaded_in_8bit", False), - "loaded_in_4bit": getattr(model, "is_loaded_in_4bit", False), - "current_key": key, - } - # **finally create or replace target cell.** - self._create_and_replace(peft_config, adapter_name, target, target_name, parent, current_key=key) - - if not is_target_modules_in_base_model: - raise ValueError( - f"Target modules {peft_config.target_modules} not found in the base model. " - f"Please check the target modules and try again." - ) - - self._mark_only_adapters_as_trainable(model) - - if self.peft_config[adapter_name].inference_mode: - for name, param in self.model.parameters_and_names(): - if adapter_name in name: - param.requires_grad = False - - def merge_adapter(self): - """ - This method merges the LoRa layers into the base model. - """ - for cell in self.model.modules(): - if isinstance(cell, BaseTunerLayer): - cell.merge() - - def unmerge_adapter(self): - """ - This method unmerges the LoRa layers from the base model. - """ - for cell in self.model.modules(): - if isinstance(cell, BaseTunerLayer): - cell.unmerge() - - -class BaseTunerLayer(ABC): - r""" - A tuner layer mixin that provides the common methods and attributes for all tuners. - - Args: - is_pluggable (`bool`, *optional*): - Whether the adapter layer can be plugged to any pytorch cell - active_adapters (Union[List[`str`], `str`], *optional*): - The name of the active adapter. - """ - # All names of layers that may contain adapter (trainable) weights - adapter_layer_names: tuple[str] = () - # All names of other parameters that may contain adapter-related parameters - other_param_names: tuple[str] = () - - # indicates whether all adapters should be disabled - _disable_adapters: bool = False - - # the currently active adapter(s) - _active_adapter: str | list[str] = "default" - - # List all merged adapters - merged_adapters: list[str] = [] - - def get_base_layer(self) -> nn.Module: - """ - (Recursively) get the base_layer. - - This is necessary for the case that the tuner layer wraps another tuner layer. - - """ - base_layer = self - while hasattr(base_layer, "base_layer"): - base_layer = base_layer.base_layer - return base_layer - - @property - def weight(self) -> Tensor: - r""" - Returns the weight of the base layer. - - Args: - self: The instance of the BaseTunerLayer class. - - Returns: - A Tensor object representing the weight of the base layer. - - Raises: - None. - """ - # This is required for some transformers code, e.g. 
for T5, weight is accessed as: - # self.wo.weight - # where "wo" is the adapter layer. - # https://github.com/huggingface/transformers/blob/78f6ed6c70b29c1560780e3869a7ad4c6b3d2710/src/transformers - # /models/t5/modeling_t5.py#L292 - base_layer = self.get_base_layer() - weight = base_layer.weight - return weight - - @property - def bias(self) -> Tensor: - r""" - This method retrieves the bias tensor from the base layer. - - Args: - self: An instance of the BaseTunerLayer class. - - Returns: - Tensor: The bias tensor obtained from the base layer. - - Raises: - This method does not raise any exceptions. - """ - base_layer = self.get_base_layer() - return base_layer.bias - - def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: - r""" - Merge the current layer with other layers. - - Args: - self (BaseTunerLayer): The instance of the BaseTunerLayer class. - safe_merge (bool): A flag indicating whether to perform a safe merge. Defaults to False. - adapter_names (Optional[list[str]]): A list of adapter names. Defaults to None. - - Returns: - None. This method does not return any value. - - Raises: - NotImplementedError: If the method is called without being implemented. - """ - raise NotImplementedError - - def unmerge(self) -> None: - r""" - unmerge(self) - This method unmerges the current instance of BaseTunerLayer. - - Args: - self: BaseTunerLayer - The instance of BaseTunerLayer to be unmerged. - - Returns: - None: This method does not return any value. - - Raises: - NotImplementedError: If the method is called, a NotImplementedError is raised as this method is not implemented. - """ - raise NotImplementedError - - @property - def merged(self) -> bool: - r""" - Returns whether the current instance of the BaseTunerLayer class has merged adapters. - - Args: - self (BaseTunerLayer): The current instance of the BaseTunerLayer class. - - Returns: - bool: A boolean value indicating whether the current instance has merged adapters. - Returns True if there are merged adapters, and False otherwise. - - Raises: - None. - - """ - return bool(self.merged_adapters) - - @property - def disable_adapters(self) -> bool: - r""" - Disables the adapters in the BaseTunerLayer. - - Args: - self: An instance of the BaseTunerLayer class. - - Returns: - bool: Returns a boolean value indicating whether the adapters were successfully disabled. - - Raises: - None. - - This method disables the adapters in the BaseTunerLayer. Adapters are components that allow communication between different systems or modules. By disabling the adapters, the BaseTunerLayer restricts -any further communication or interaction with external systems. - - Note: - The disable_adapters method does not remove or delete the adapters from the BaseTunerLayer instance. It only disables their functionality temporarily. To enable the adapters again, use the -enable_adapters method. - - Example: - >>> tuner_layer = BaseTunerLayer() - >>> tuner_layer.disable_adapters() - True - """ - # use a property to ensure that disable_adapters is not set directly, instead use the enable_adapters method - return self._disable_adapters - - @property - def active_adapter(self) -> str | list[str]: - r"""Return the active adapter. - - This method is a property of the BaseTunerLayer class. It returns the active adapter, which can be either a string or a list of strings. - - Args: - self: An instance of the BaseTunerLayer class. - - Returns: - str | list[str]: The active adapter. 
If there is only one active adapter, it is returned as a string. If there are multiple active adapters, they are returned as a list of strings. - - Raises: - None. - - """ - # use a property to ensure that active_adapter is not set directly, instead use the set_adapter method - return self._active_adapter - - @property - def active_adapters(self): - r""" - Returns a list of active adapters. - - Args: - self (BaseTunerLayer): The instance of the BaseTunerLayer class. - - Returns: - list: A list of active adapters. If the active_adapter attribute is a string, it will be returned as a single-item list. Otherwise, the active_adapter attribute will be returned as is. - - Raises: - None. - """ - if isinstance(self.active_adapter, str): - return [self.active_adapter] - # is already a list of str - return self.active_adapter - - def enable_adapters(self, enabled: bool) -> None: - """Toggle the enabling and disabling of adapters - - Takes care of setting the requires_grad flag for the adapter weights. - - Args: - enabled (bool): True to enable adapters, False to disable adapters - """ - if enabled: - self.set_adapter(self.active_adapters) - self._disable_adapters = False - else: - # disable grads on all adapter layers - for layer_name in self.adapter_layer_names: - layer = getattr(self, layer_name) - layer.set_grad(requires_grad=False) - self._disable_adapters = True - - def set_adapter(self, adapter_names: str | list[str]) -> None: - """Set the active adapter(s). - - Additionally, this function will set the specified adapters to trainable (i.e., requires_grad=True). If this is - not desired, use the following code. - - ```py - >>> for name, param in model_peft.named_parameters(): - ... if ...: # some check on name (ex. if 'lora' in name) - ... param.requires_grad = False - ``` - - Args: - adapter_name (`str` or `List[str]`): Name of the adapter(s) to be activated. - """ - if isinstance(adapter_names, str): - adapter_names = [adapter_names] - - # Deactivate grads on the inactive adapter and activate grads on the active adapter - for layer_name in self.adapter_layer_names: - module_dict = getattr(self, layer_name) - for key, layer in module_dict.items(): - if key in adapter_names: - # Note: It is possible that not a single layer is called with requires_grad_(True) here. This may - # happen if a completely different adapter layer is being activated. - layer.requires_grad = True - else: - layer.requires_grad = False - - self._active_adapter = adapter_names - - def _all_available_adapter_names(self) -> list[str]: - """Return a sorted list of all available adapter names""" - adapter_names = set() - for name in self.adapter_layer_names + self.other_param_names: - # we check each possible attribute and if it's a dict or ModuleDict, we assume that the keys are the adapter - # names - attr = getattr(self, name) - if hasattr(attr, "keys"): - adapter_names.update(attr.keys()) - return sorted(adapter_names) - - def delete_adapter(self, adapter_name: str) -> None: - """ - Delete an adapter from the layer - - This should be called on all adapter layers, or else we will get an inconsistent state. - - This method will also set a new active adapter if the deleted adapter was an active adapter. It is important - that the new adapter is chosen in a deterministic way, so that the same adapter is chosen on all layers. 
- - Args: - adapter_name (`str`): The name of the adapter to delete - - """ - for attr in self.adapter_layer_names + self.other_param_names: - if adapter_name in getattr(self, attr): - del getattr(self, attr)[adapter_name] - - if adapter_name in self.active_adapters: - # choose a new active adapter - active_adapters = self.active_adapters[:] - active_adapters.remove(adapter_name) - if active_adapters: - self.set_adapter(active_adapters) - else: - # no active adapters left, set a new default adapter - # here we get the list of all adapters existing adapter names and choose the first one - remaining_adapters = self._all_available_adapter_names() - if not remaining_adapters: - self.set_adapter([]) - else: - new_active_adapter = remaining_adapters[0] - warnings.warn( - f"Adapter {adapter_name} was active which is now deleted. Setting active adapter to " - f"{new_active_adapter}." - ) - self.set_adapter(remaining_adapters[0]) - -def check_adapters_to_merge(cell: BaseTunerLayer, adapter_names: Optional[list[str]] = None) -> list[str]: - """ - Helper function to check which adapters should be merged. - - Only return those adapters that are not already merged. Give a warning if some or all of the adapters are already - merged. - - """ - if adapter_names is None: - adapter_names = cell.active_adapters - - if cell.merged: - merged_adapters = set(cell.merged_adapters) - adapter_names = [name for name in adapter_names if name not in merged_adapters] - - if adapter_names: - warnings.warn( - f"Already following adapters were merged {','.join(cell.merged_adapters)}. " - f"You are now additionally merging {','.join(adapter_names)}." - ) - else: - warnings.warn("All adapters are already merged, nothing to do.") - - return adapter_names - -def check_target_module_exists(config, key: str) -> bool | re.Match[str] | None: - """A helper method to check if the passed cell's key name matches any of the target modules in the adapter_config. 
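`check_adapters_to_merge` above only filters: it drops any adapter that already appears in `merged_adapters` and warns accordingly. A tiny plain-Python illustration of that filtering (the adapter names are made up):

```py
# Illustration of the check_adapters_to_merge filtering with plain lists.
active_adapters = ["default", "task_b"]
merged_adapters = {"default"}          # already merged earlier

to_merge = [name for name in active_adapters if name not in merged_adapters]
print(to_merge)  # ['task_b'] -- only the not-yet-merged adapter is returned
```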
- - Args: - config (`LoraConfig` | `LycorisConfig`): A config to match target modules from - key (`str`): A key to search any matches in config - - Returns: - `bool` | `re.Match[str]` | `None`: True of match object if key matches any target modules from config, False or - None if no match found - """ - if isinstance(config.target_modules, str): - target_module_found = re.fullmatch(config.target_modules, key) - elif key in config.target_modules: - # this cell is specified directly in target_modules - target_module_found = True - else: - target_module_found = any(key.endswith(f".{target_key}") for target_key in config.target_modules) - - layer_indexes = getattr(config, "layers_to_transform", None) - layers_pattern = getattr(config, "layers_pattern", None) - - is_using_layer_indexes = layer_indexes is not None and ( - len(layer_indexes) != 0 if isinstance(layer_indexes, list) else True - ) - if is_using_layer_indexes and target_module_found: - layer_index = None - if layers_pattern is None or len(layers_pattern) == 0: - layer_index = re.match(r".*\.[^.]*\.(\d+)\.", key) - else: - layers_pattern = [layers_pattern] if isinstance(layers_pattern, str) else layers_pattern - for pattern in layers_pattern: - layer_index = re.match(rf".*\.{pattern}\.(\d+)\.", key) - if layer_index is not None: - break - - if layer_index is None: - target_module_found = False - else: - layer_index = int(layer_index.group(1)) - if isinstance(layer_indexes, int): - target_module_found = layer_index == layer_indexes - else: - target_module_found = layer_index in layer_indexes - - return target_module_found - - -def clone_cell(cell: nn.Module, share_weights=False): - """Clone a cell in a pytorch model. - - Clones a cell of a model, optionally sharing all the parameters between the original and the clone. Simplifies - reusing a cell when manipulating the architecture of a model. - """ - clone = copy.deepcopy(cell) - - def _share_weights(src: nn.Module, dst: nn.Module): - for name, param in src.parameters_and_names(expand=False): - setattr(dst, name, param) - - if share_weights: - for name, submodule in cell.parameters_and_names(): - _share_weights(submodule, clone.get_submodule(name)) - - return clone - -def replicate_layers(model: nn.Module, layer_map: list[tuple[int, int]]): - """Replicate layers in a transfomer model with weight sharing. - - This function looks for a cell list attribute at model[(.model)*].layers and replicates the layers in the cell - list according to the layer map. For example the map `[[0, 4], [2, 5]]` will take the set of layers `[0, 1, 2, 3, - 4]` and replace them with a cell list containing `[0, 1, 2, 3, 2, 3, 4]`. - """ - while hasattr(model, "model"): - model = model.model - # Some variants of the bert model nest the main model under the bert attribute. - if hasattr(model, "bert"): - model = model.bert - - model_type = None - layers: nn.ModuleList = None - if hasattr(model, "layers"): - model_type = "llama" - layers = model.layers - elif hasattr(model, "encoder") and hasattr(model.encoder, "layer"): - model_type = "bert" - layers = model.encoder.layer - elif hasattr(model, "h"): - model_type = "falcon" - layers = model.h - if not model_type or not isinstance(layers, nn.ModuleList): - raise ValueError( - "Could not locate the layers attribute in the model. " - "Expected Llama, Bert or Falcon compatible architectures." 
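The matching rules in `check_target_module_exists` come down to: a string `target_modules` is treated as a full regex, a list matches on equality or a `.<target>` suffix, and `layers_to_transform` then restricts hits to layer indices extracted from the key. A condensed sketch of those rules, using invented module keys (an approximation of the deleted logic, not a drop-in replacement):

```py
import re

def matches_target(key, target_modules, layer_indexes=None):
    # string config -> full regex match; list config -> exact or ".suffix" match
    if isinstance(target_modules, str):
        found = bool(re.fullmatch(target_modules, key))
    else:
        found = key in target_modules or any(
            key.endswith(f".{t}") for t in target_modules
        )
    # optional restriction to specific layer indices, parsed out of the key
    if found and layer_indexes is not None:
        m = re.match(r".*\.[^.]*\.(\d+)\.", key)
        found = m is not None and int(m.group(1)) in layer_indexes
    return found

print(matches_target("model.layers.3.self_attn.q_proj", ["q_proj", "v_proj"]))           # True
print(matches_target("model.layers.3.self_attn.q_proj", ["q_proj"], layer_indexes=[0]))  # False
print(matches_target("model.layers.0.self_attn.q_proj", ["q_proj"], layer_indexes=[0]))  # True
```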
- ) - - new_layers = [] - for start, end in layer_map: - for i in range(start, end): - current_idx = len(new_layers) - new_layers.append(clone_cell(layers[i], share_weights=True)) - # This is a hack needed to work around the layer_idx introduced in HF transformers. - for submodule in new_layers[-1].modules(): - if hasattr(submodule, "layer_idx"): - submodule.layer_idx = current_idx - layers = nn.ModuleList(new_layers) - if model_type == "llama": - model.layers = layers - elif model_type == "bert": - model.encoder.layer = layers - elif model_type == "falcon": - model.h = layers - else: - raise ValueError("Unexpected model type, need to handle post-processing of layers.") - if hasattr(model.config, "num_hidden_layers"): # Common to Llama, Bert, Falcon. - model.config.num_hidden_layers = len(new_layers) diff --git a/mindnlp/peft/utils/__init__.py b/mindnlp/peft/utils/__init__.py deleted file mode 100644 index 3fa71a68f..000000000 --- a/mindnlp/peft/utils/__init__.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""utils for peft""" -from .peft_types import PeftType, TaskType -from .other import ( - _set_trainable, - # add_library_to_model_card, - # prepare_model_for_int8_training, - # prepare_model_for_kbit_training, - shift_tokens_right, - transpose, - _get_batch_size, - _get_submodules, - _set_adapter, - _freeze_adapter, - ModulesToSaveWrapper, - _prepare_prompt_learning_config, - # _is_valid_match, -) -# from .hub_utils import hub_file_exists -from .save_and_load import ( - get_peft_model_state_dict, - set_peft_model_state_dict, - load_peft_weights, -) -from .constants import ( - bloom_model_postprocess_past_key_value, - TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING, - TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, - TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING, - TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING, - TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING, - TRANSFORMERS_MODELS_TO_LNTUNING_TARGET_MODULES_MAPPING, - CONFIG_NAME, - WEIGHTS_NAME, - SAFETENSORS_WEIGHTS_NAME, -) diff --git a/mindnlp/peft/utils/constants.py b/mindnlp/peft/utils/constants.py deleted file mode 100644 index 8e7053da8..000000000 --- a/mindnlp/peft/utils/constants.py +++ /dev/null @@ -1,261 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
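The replication loop above only expands index ranges; the map `[[0, 4], [2, 5]]` yields the layer order `[0, 1, 2, 3, 2, 3, 4]`, with the clones sharing weights. The index expansion itself can be sketched without any model:

```py
# Pure-Python sketch of how a layer_map of [start, end) ranges is expanded.
def expand_layer_map(layer_map):
    new_indices = []
    for start, end in layer_map:
        new_indices.extend(range(start, end))
    return new_indices

print(expand_layer_map([[0, 4], [2, 5]]))  # [0, 1, 2, 3, 2, 3, 4]
```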
-"""constants for peft""" -from mindspore import ops - -# needed for prefix-tuning of bloom model -def bloom_model_postprocess_past_key_value(past_key_values): - r""" - This function takes a single parameter 'past_key_values' and performs post-processing on it. - - Args: - past_key_values (Tensor): A tensor containing past key values. The shape of the tensor is (total_layers, batch_size, num_attention_heads, num_virtual_tokens, head_dim). - - Returns: - tuple: A tuple containing two elements, 'keys' and 'values'. - - 'keys' (Tensor): A tensor containing the processed keys. The shape of the tensor is (total_layers // 2, batch_size * num_attention_heads, head_dim, num_virtual_tokens). - - 'values' (Tensor): A tensor containing the processed values. The shape of the tensor is (total_layers // 2, batch_size * num_attention_heads, num_virtual_tokens, head_dim). - - Raises: - None. - """ - past_key_values = ops.cat(past_key_values) - total_layers, batch_size, num_attention_heads, num_virtual_tokens, head_dim = past_key_values.shape - keys = past_key_values[: total_layers // 2] - keys = keys.swapaxes(2, 3).reshape( - total_layers // 2, batch_size * num_attention_heads, head_dim, num_virtual_tokens - ) - values = past_key_values[total_layers // 2 :] - values = values.reshape(total_layers // 2, batch_size * num_attention_heads, num_virtual_tokens, head_dim) - - return tuple(zip(keys, values)) - - -# needed for prefix-tuning of StarCoder models -def starcoder_model_postprocess_past_key_value(past_key_values): - r""" - Args: - past_key_values (list): A list of past key values in a specific format. - - Returns: - tuple: A tuple containing processed key values. - - Raises: - None. - """ - result = [] - for k in past_key_values: - k = k[:, :, 0] - k = k.permute([1, 2, 0, 3]) - k = k.reshape(*k.shape[:-2], -1) - result.append(k) - return tuple(result) - - -TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING = { - "bloom": bloom_model_postprocess_past_key_value, - "gpt_bigcode": starcoder_model_postprocess_past_key_value, -} -TRANSFORMERS_MODELS_TO_LNTUNING_TARGET_MODULES_MAPPING = { - "llama": ["input_layernorm", "post_attention_layernorm", "norm"], - "bloom": ["input_layernorm", "post_attention_layernorm", "ln_f"], - "llava": [ - "multi_modal_projector", - "input_layernorm", - "post_attention_layernorm", - "norm", - "embed_tokens", - "lm_head", - ], - "t5": ["layer_norm", "final_layer_norm"], - "mt5": ["layer_norm", "final_layer_norm"], - "bart": ["self_attn_layer_norm", "encoder_attn_layer_norm", "final_layer_norm"], - "gpt2": ["ln_1", "ln_2", "ln_f"], - "blip-2": ["layernorm", "LayerNorm", "final_layer_norm", "self_attn_layer_norm"], - "gptj": ["ln_1", "ln_f"], - "falcon": ["input_layernorm", "post_attention_layernorm", "ln_f"], - "mistral": ["input_layernorm", "post_attention_layernorm", "norm"], - "phi": ["input_layernorm", "final_layernorm"], - "gemma": ["input_layernorm", "post_attention_layernorm", "norm"], -} - -TRANSFORMERS_MODELS_TO_LNTUNING_TARGET_MODULES_MAPPING = { - "llama": ["input_layernorm", "post_attention_layernorm", "norm"], - "bloom": ["input_layernorm", "post_attention_layernorm", "ln_f"], - "llava": [ - "multi_modal_projector", - "input_layernorm", - "post_attention_layernorm", - "norm", - "embed_tokens", - "lm_head", - ], - "t5": ["layer_norm", "final_layer_norm"], - "mt5": ["layer_norm", "final_layer_norm"], - "bart": ["self_attn_layer_norm", "encoder_attn_layer_norm", "final_layer_norm"], - "gpt2": ["ln_1", "ln_2", "ln_f"], - "blip-2": ["layernorm", "LayerNorm", 
"final_layer_norm", "self_attn_layer_norm"], - "gptj": ["ln_1", "ln_f"], - "falcon": ["input_layernorm", "post_attention_layernorm", "ln_f"], - "mistral": ["input_layernorm", "post_attention_layernorm", "norm"], - "phi": ["input_layernorm", "final_layernorm"], - "gemma": ["input_layernorm", "post_attention_layernorm", "norm"], -} - -TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING = { - "t5": ["q", "v"], - "mt5": ["q", "v"], - "bart": ["q_proj", "v_proj"], - "gpt2": ["c_attn"], - "bloom": ["query_key_value"], - "blip-2": ["q", "v", "q_proj", "v_proj"], - "opt": ["q_proj", "v_proj"], - "gptj": ["q_proj", "v_proj"], - "gpt_neox": ["query_key_value"], - "gpt_neo": ["q_proj", "v_proj"], - "bert": ["query", "value"], - "roberta": ["query", "value"], - "xlm-roberta": ["query", "value"], - "electra": ["query", "value"], - "deberta-v2": ["query_proj", "value_proj"], - "deberta": ["in_proj"], - "layoutlm": ["query", "value"], - "llama": ["q_proj", "v_proj"], - "chatglm": ["query_key_value"], - "gpt_bigcode": ["c_attn"], - "mpt": ["Wqkv"], - "RefinedWebModel": ["query_key_value"], - "RefinedWeb": ["query_key_value"], - "falcon": ["query_key_value"], - "btlm": ["c_proj", "c_attn"], - "codegen": ["qkv_proj"], - "mistral": ["q_proj", "v_proj"], - "mixtral": ["q_proj", "v_proj"], - "stablelm": ["q_proj", "v_proj"], - "phi": ["q_proj", "v_proj", "fc1", "fc2"], - "gemma": ["q_proj", "v_proj"], -} - -TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING = { - "t5": ["k", "v", "wo"], - "mt5": ["k", "v", "wi_1"], - "gpt2": ["c_attn", "mlp.c_proj"], - "bloom": ["query_key_value", "mlp.dense_4h_to_h"], - "roberta": ["key", "value", "output.dense"], - "opt": ["q_proj", "k_proj", "fc2"], - "gptj": ["q_proj", "v_proj", "fc_out"], - "gpt_neox": ["query_key_value", "dense_4h_to_h"], - "gpt_neo": ["q_proj", "v_proj", "c_proj"], - "bart": ["q_proj", "v_proj", "fc2"], - "gpt_bigcode": ["c_attn", "mlp.c_proj"], - "llama": ["k_proj", "v_proj", "down_proj"], - "mistral": ["k_proj", "v_proj", "down_proj"], - "mixtral": ["k_proj", "v_proj", "w2"], - "bert": ["key", "value", "output.dense"], - "deberta-v2": ["key_proj", "value_proj", "output.dense"], - "deberta": ["in_proj", "output.dense"], - "RefinedWebModel": ["query_key_value", "dense_4h_to_h"], - "RefinedWeb": ["query_key_value", "dense_4h_to_h"], - "falcon": ["query_key_value", "dense_4h_to_h"], - "phi": ["q_proj", "v_proj", "fc2"], - "gemma": ["q_proj", "v_proj", "down_proj"], -} - -TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING = { - "t5": ["wo"], - "mt5": [], - "gpt2": ["mlp.c_proj"], - "bloom": ["mlp.dense_4h_to_h"], - "roberta": ["output.dense"], - "opt": ["fc2"], - "gptj": ["fc_out"], - "gpt_neox": ["dense_4h_to_h"], - "gpt_neo": ["c_proj"], - "bart": ["fc2"], - "gpt_bigcode": ["mlp.c_proj"], - "llama": ["down_proj"], - "mistral": ["down_proj"], - "mixtral": ["w2"], - "bert": ["output.dense"], - "deberta-v2": ["output.dense"], - "deberta": ["output.dense"], - "RefinedWeb": ["dense_4h_to_h"], - "RefinedWebModel": ["dense_4h_to_h"], - "falcon": ["dense_4h_to_h"], - "phi": ["fc2"], - "gemma": ["down_proj"], -} - -TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING = { - "t5": ["q", "k", "v", "o", "wi", "wo"], - "mt5": ["q", "k", "v", "o", "wi_0", "wi_1", "wo"], - "bart": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], - "gpt2": ["c_attn"], - "bloom": ["query_key_value"], - "opt": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], - "gptj": ["q_proj", "v_proj"], - "gpt_neox": ["query_key_value"], - "gpt_neo": ["q_proj", "v_proj"], - "llama": 
["q_proj", "v_proj"], - "bert": ["query", "value"], - "roberta": ["query", "key", "value", "dense"], - # "xlm-roberta": ["query", "value"], - # "electra": ["query", "value"], - "deberta-v2": ["query_proj", "key_proj", "value_proj", "dense"], - "gpt_bigcode": ["c_attn"], - "deberta": ["in_proj"], - # "layoutlm": ["query", "value"], -} - -TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING = { - "t5": ["q", "v"], - "mt5": ["q", "v"], - "bart": ["q_proj", "v_proj"], - "gpt2": ["c_attn"], - "bloom": ["query_key_value"], - "blip-2": ["q", "v", "q_proj", "v_proj"], - "opt": ["q_proj", "v_proj"], - "gptj": ["q_proj", "v_proj"], - "gpt_neox": ["query_key_value"], - "gpt_neo": ["q_proj", "v_proj"], - "bert": ["query", "value"], - "roberta": ["query", "value"], - "xlm-roberta": ["query", "value"], - "electra": ["query", "value"], - "deberta-v2": ["query_proj", "value_proj"], - "deberta": ["in_proj"], - "layoutlm": ["query", "value"], - "llama": ["q_proj", "v_proj"], - "chatglm": ["query_key_value"], - "gpt_bigcode": ["c_attn"], - "mpt": ["Wqkv"], - "RefinedWebModel": ["query_key_value"], - "RefinedWeb": ["query_key_value"], - "falcon": ["query_key_value"], - # "btlm": ["c_proj", "c_attn"], # tested, does not work because of different shapes - "codegen": ["qkv_proj"], - # "mistral": ["q_proj", "v_proj"], # tested, does not work because of different shapes - # "mixtral": ["q_proj", "v_proj"], # tested, does not work because of different shapes - "stablelm": ["q_proj", "v_proj"], - # "phi": ["q_proj", "v_proj", "fc1", "fc2"], # tested, does not work because of different shapes - "phi": ["q_proj", "v_proj"], - # "gemma": ["q_proj", "v_proj"], # tested, does not work because of different shapes -} - -WEIGHTS_NAME = "adapter_model.ckpt" -SAFETENSORS_WEIGHTS_NAME = "adapter_model.safetensors" -CONFIG_NAME = "adapter_config.json" -EMBEDDING_LAYER_NAMES = ["embed_tokens", "lm_head"] -INCLUDE_LINEAR_LAYERS_SHORTHAND = "all-linear" -TOKENIZER_CONFIG_NAME = "tokenizer_config.json" diff --git a/mindnlp/peft/utils/merge_utils.py b/mindnlp/peft/utils/merge_utils.py deleted file mode 100644 index 8f63ee395..000000000 --- a/mindnlp/peft/utils/merge_utils.py +++ /dev/null @@ -1,271 +0,0 @@ -# Copyright 2024-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""merge utils""" -import warnings -from typing import List -try: - from typing import Literal -except: - from typing_extensions import Literal -import mindspore -from mindspore import ops - - -def reshape_weight_task_tensors(task_tensors, weights): - """ - Reshapes `weights` to match the shape of `task_tensors` by unsqeezing in the remaining dimenions. - - Args: - task_tensors (`mindspore.Tensor`): The tensors that will be used to reshape `weights`. - weights (`mindspore.Tensor`): The tensor to be reshaped. - - Returns: - `mindspore.Tensor`: The reshaped tensor. 
- """ - new_shape = weights.shape + (1,) * (task_tensors.ndim - weights.ndim) - weights = weights.view(new_shape) - return weights - - -def magnitude_based_pruning(tensor: mindspore.Tensor, density: float) -> mindspore.Tensor: - """ - Prune the smallest values of the task tensors and retain the top-k values based on the specified fraction - `density`. - - Args: - tensor (`mindspore.Tensor`):The tensor to prune. - density (`float`):The fraction of values to preserve. Should be in [0,1]. - - Returns: - `mindspore.Tensor`: The tensor with the pruned weights. - """ - mask = ops.zeros_like(tensor).reshape(-1) - k = int(density * tensor.numel()) - top_k = ops.topk(tensor.abs().reshape(-1), k=k, largest=True) - mask[top_k[1]] = 1 - return tensor * mask.reshape(tensor.shape) - - -def random_pruning(tensor: mindspore.Tensor, density: float, rescale: bool) -> mindspore.Tensor: - """ - Prune random values based on the specified fraction `density`. - - Args: - tensor (`mindspore.Tensor`):The tensor to prune. - density (`float`):The fraction of values to preserve. Should be in [0,1]. - rescale (`bool`):Whether to rescale the result to preserve the expected value of the original tensor. - - Returns: - `mindspore.Tensor`: The pruned tensor. - """ - mask = ops.bernoulli(ops.full_like(input=tensor, fill_value=density)) - pruned_tensor = tensor * mask - if rescale: - ops.div(input=pruned_tensor, other=density) - return pruned_tensor - - -def prune( - tensor: mindspore.Tensor, density: float, method: Literal["magnitude", "random"], rescale: bool = False -) -> mindspore.Tensor: - """ - Prune the values of task tensors based on the `method`. - - Args: - tensor (`mindspore.Tensor`):The tensor to prune. - density (`float`):The fraction of values to preserve. Should be in [0,1]. - method (`str`):The method to use to prune. Should be one of ["magnitude", "random"]. - rescale (`bool`):Whether to rescale the result to preserve the expected value of the original tensor. - - Returns: - `mindspore.Tensor`: The pruned tensor. - """ - if density >= 1: - warnings.warn(f"The density {density} is greater than or equal to 1, no pruning will be performed.") - return tensor - elif density < 0: - raise ValueError(f"Density should be >= 0, got {density}") - if method == "magnitude": - return magnitude_based_pruning(tensor, density) - elif method == "random": - return random_pruning(tensor, density, rescale=rescale) - else: - raise ValueError(f"Unknown method {method}") - - -def calculate_majority_sign_mask( - tensor: mindspore.Tensor, method: Literal["total", "frequency"] = "total" -) -> mindspore.Tensor: - """ - Get the mask of the majority sign across the task tensors. Task tensors are stacked on dimension 0. - - Args: - tensor (`mindspore.Tensor`):The tensor to get the mask from. - method (`str`):The method to use to get the mask. Should be one of ["total", "frequency"]. - - Returns: - `mindspore.Tensor`: The majority sign mask. - """ - sign = tensor.sign() - if method == "total": - sign_magnitude = tensor.sum(dim=0) - elif method == "frequency": - sign_magnitude = sign.sum(dim=0) - else: - raise RuntimeError(f'Unimplemented mask method "{method}"') - majority_sign = ops.where(sign_magnitude >= 0, 1, -1) - return sign == majority_sign - - -def disjoint_merge(task_tensors: mindspore.Tensor, majority_sign_mask: mindspore.Tensor) -> mindspore.Tensor: - """ - Merge the task tensors using disjoint merge. - - Args: - task_tensors (`mindspore.Tensor`):The task tensors to merge. 
- majority_sign_mask (`mindspore.Tensor`):The mask of the majority sign across the task tensors. - - Returns: - `mindspore.Tensor`: The merged tensor. - """ - mixed_task_tensors = (task_tensors * majority_sign_mask).sum(dim=0) - num_params_preserved = majority_sign_mask.sum(dim=0) - return mixed_task_tensors / ops.clamp(num_params_preserved, min=1.0) - - -def task_arithmetic(task_tensors: List[mindspore.Tensor], weights: mindspore.Tensor) -> mindspore.Tensor: - """ - Merge the task tensors using `task arithmetic`. - - Args: - task_tensors(`List[mindspore.Tensor]`):The task tensors to merge. - weights (`mindspore.Tensor`):The weights of the task tensors. - - Returns: - `mindspore.Tensor`: The merged tensor. - """ - task_tensors = ops.stack(task_tensors, axis=0) - # weighted task tensors - weights = reshape_weight_task_tensors(task_tensors, weights) - weighted_task_tensors = task_tensors * weights - mixed_task_tensors = weighted_task_tensors.sum(dim=0) - return mixed_task_tensors - - -def magnitude_prune(task_tensors: List[mindspore.Tensor], weights: mindspore.Tensor, density: float) -> mindspore.Tensor: - """ - Merge the task tensors using `task arithmetic`. - - Args: - task_tensors(`List[mindspore.Tensor]`):The task tensors to merge. - weights (`mindspore.Tensor`):The weights of the task tensors. - density (`float`): The fraction of values to preserve. Should be in [0,1]. - - Returns: - `mindspore.Tensor`: The merged tensor. - """ - # sparsify - task_tensors = [prune(tensor, density, method="magnitude") for tensor in task_tensors] - task_tensors = ops.stack(task_tensors, axis=0) - # weighted task tensors - weights = reshape_weight_task_tensors(task_tensors, weights) - weighted_task_tensors = task_tensors * weights - mixed_task_tensors = weighted_task_tensors.sum(dim=0) - return mixed_task_tensors - - -def ties( - task_tensors: List[mindspore.Tensor], - weights: mindspore.Tensor, - density: float, - majority_sign_method: Literal["total", "frequency"] = "total", -) -> mindspore.Tensor: - """ - Merge the task tensors using `ties`. - - Args: - task_tensors(`List[mindspore.Tensor]`):The task tensors to merge. - weights (`mindspore.Tensor`):The weights of the task tensors. - density (`float`):The fraction of values to preserve. Should be in [0,1]. - majority_sign_method (`str`): - The method to use to get the majority sign mask. Should be one of ["total", "frequency"]. - - Returns: - `mindspore.Tensor`: The merged tensor. - """ - # sparsify - task_tensors = [prune(tensor, density, method="magnitude") for tensor in task_tensors] - task_tensors = ops.stack(task_tensors, axis=0) - # Elect Sign - majority_sign_mask = calculate_majority_sign_mask(task_tensors, method=majority_sign_method) - # weighted task tensors - weights = reshape_weight_task_tensors(task_tensors, weights) - weighted_task_tensors = task_tensors * weights - # Disjoint Merge - mixed_task_tensors = disjoint_merge(weighted_task_tensors, majority_sign_mask) - return mixed_task_tensors - - -def dare_linear(task_tensors: List[mindspore.Tensor], weights: mindspore.Tensor, density: float) -> mindspore.Tensor: - """ - Merge the task tensors using `dare linear`. - - Args: - task_tensors(`List[mindspore.Tensor]`):The task tensors to merge. - weights (`mindspore.Tensor`):The weights of the task tensors. - density (`float`):The fraction of values to preserve. Should be in [0,1]. - - Returns: - `mindspore.Tensor`: The merged tensor. 
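`ties` chains three steps: magnitude-prune each task tensor, elect a per-element majority sign, then average only the values that agree with that sign (the disjoint merge). A small numpy sketch of the same pipeline, with numpy standing in for `mindspore.ops` purely for illustration (the threshold-based pruning here is a simplification of the top-k mask above):

```py
import numpy as np

def prune_magnitude(t, density):
    # keep roughly the top-k values by magnitude, zero the rest
    k = max(1, int(density * t.size))
    threshold = np.sort(np.abs(t).ravel())[-k]
    return np.where(np.abs(t) >= threshold, t, 0.0)

def ties_merge(task_tensors, weights, density):
    pruned = np.stack([prune_magnitude(t, density) for t in task_tensors])
    sign = np.sign(pruned)
    majority = np.where(pruned.sum(axis=0) >= 0, 1.0, -1.0)   # "total" method
    mask = sign == majority                                   # elect sign
    weighted = pruned * weights[:, None]
    merged = (weighted * mask).sum(axis=0)                    # disjoint merge
    count = np.clip(mask.sum(axis=0), 1, None)                # avoid divide-by-zero
    return merged / count

t1 = np.array([ 0.9, -0.1,  0.4])
t2 = np.array([-0.8,  0.7,  0.3])
print(ties_merge([t1, t2], weights=np.array([1.0, 1.0]), density=0.7))
```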
- """ - # sparsify - task_tensors = [prune(tensor, density, method="random", rescale=True) for tensor in task_tensors] - task_tensors = ops.stack(task_tensors, axis=0) - # weighted task tensors - weights = reshape_weight_task_tensors(task_tensors, weights) - weighted_task_tensors = task_tensors * weights - mixed_task_tensors = weighted_task_tensors.sum(dim=0) - return mixed_task_tensors - - -def dare_ties( - task_tensors: List[mindspore.Tensor], - weights: mindspore.Tensor, - density: float, - majority_sign_method: Literal["total", "frequency"] = "total", -) -> mindspore.Tensor: - """ - Merge the task tensors using `dare ties`. - - Args: - task_tensors(`List[mindspore.Tensor]`):The task tensors to merge. - weights (`mindspore.Tensor`):The weights of the task tensors. - density (`float`):The fraction of values to preserve. Should be in [0,1]. - majority_sign_method (`str`): - The method to use to get the majority sign mask. Should be one of ["total", "frequency"]. - - Returns: - `mindspore.Tensor`: The merged tensor. - """ - # sparsify - task_tensors = [prune(tensor, density, method="random", rescale=True) for tensor in task_tensors] - task_tensors = ops.stack(task_tensors, axis=0) - # Elect Sign - majority_sign_mask = calculate_majority_sign_mask(task_tensors, method=majority_sign_method) - # weighted task tensors - weights = reshape_weight_task_tensors(task_tensors, weights) - weighted_task_tensors = task_tensors * weights - # Disjoint Merge - mixed_task_tensors = disjoint_merge(weighted_task_tensors, majority_sign_mask) - return mixed_task_tensors diff --git a/mindnlp/peft/utils/other.py b/mindnlp/peft/utils/other.py deleted file mode 100644 index 6c81e55a1..000000000 --- a/mindnlp/peft/utils/other.py +++ /dev/null @@ -1,472 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""other utils""" -import copy -from contextlib import nullcontext -from typing import Optional, List - -import mindspore -from mindspore import Tensor -from mindspore.common.initializer import initializer, Normal - -from mindnlp.core import nn, ops -from mindnlp.core.nn import Parameter -from mindnlp.core.nn import ParameterDict - -def _get_batch_size(input_ids: Optional[Tensor], inputs_embeds: Optional[Tensor]) -> int: - """Get the batch size based on either input_ids or input_embeds - - Raises an ValueError if both are None. - - """ - if (input_ids is None) and (inputs_embeds is None): - raise ValueError("You have to provide either input_ids or inputs_embeds") - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - return batch_size - -class ModulesToSaveWrapper(nn.Module): - - r""" - This class represents a wrapper for saving and managing modules in a neural network. It provides functionality to save and switch between different modules, known as adapters, while also maintaining the -original module for reference. 
The class includes methods for enabling and disabling adapters, setting the active adapter, updating the saved modules, and forwarding the model with the appropriate adapter. - - The class inherits from nn.Module and includes the following methods: - - __init__: Initializes the ModulesToSaveWrapper instance with the original cell to save and the initial adapter name. - - check_cell: Performs sanity checks on the original cell to ensure compatibility with the saving mechanism. - - disable_adapters: Toggles the enabling and disabling of adapters, managing the requires_grad flag for adapter weights. - - active_adapter: Returns the name of the currently active adapter. - - weight: Retrieves the weight of the original cell or the active adapter's cell if available. - - update: Updates the saved modules with a new adapter, creating a deep copy of the original cell. - - forward: Constructs the model using the original cell or the active adapter's cell based on the adapter status. - - enable_adapters: Toggles the enabling and disabling of adapters, managing the requires_grad flag for adapter weights. - - set_adapter: Sets the active adapter, making it trainable and updating the requires_grad flag for the modules. - - The class provides a flexible way to manage and switch between different modules within a neural network. - """ - def __init__(self, module_to_save, adapter_name): - r""" - Initializes an instance of the ModulesToSaveWrapper class. - - Args: - self (ModulesToSaveWrapper): The current instance of the class. - module_to_save (Any): The cell to be saved. - adapter_name (str): The name of the adapter. - - Returns: - None. This method does not return any value. - - Raises: - None. - """ - super().__init__() - self.original_cell = module_to_save - self.modules_to_save = nn.ModuleDict({}) - self._active_adapter = adapter_name - self._disable_adapters = False - self.update(adapter_name) - self.check_cell() - - def check_cell(self): - """Perform some sanity checks on the cell to ensure that it works""" - # Try to anticipate some modules that users could try to target that would not work. - # Note: It's not possible to check hasattr(cell, "forward"), since that returns True for ModuleDict and - # ModuleList, even though their forward methods cannot be called - forbidden_classes = (nn.ModuleDict, nn.ModuleList, ParameterDict) - if isinstance(self.original_cell, forbidden_classes): - cls_name = self.original_cell.__class__.__name__ - raise TypeError(f"modules_to_save cannot be applied to modules of type {cls_name}") - - @property - def disable_adapters(self) -> bool: - r""" - Method to retrieve the status of whether adapters are disabled in the ModulesToSaveWrapper class. - - Args: - self (ModulesToSaveWrapper): The instance of the ModulesToSaveWrapper class. - This parameter is always required as it refers to the instance calling this method. - - Returns: - bool: A boolean value indicating whether adapters are disabled. - Returns True if adapters are disabled, False otherwise. - - Raises: - No specific exceptions are raised by this method. - """ - # use a property to ensure that disable_adapters is not set directly, instead use the enable_adapters method - return self._disable_adapters - - @property - def active_adapter(self) -> str: - r""" - This method retrieves the active adapter from the ModulesToSaveWrapper class. - - Args: - self: The instance of the ModulesToSaveWrapper class. - - Returns: - str: The active adapter as a string. 
- - Raises: - None - """ - # use a property to ensure that active_adapter is not set directly, instead use the set_adapter method - return self._active_adapter - - @property - def weight(self): - r""" - This method 'weight' is a property method within the 'ModulesToSaveWrapper' class. - - Args: - - self: (object) The instance of the 'ModulesToSaveWrapper' class. - - Returns: - - None: This method returns a value of type None. - - Raises: - - None: This method does not raise any exceptions. - """ - if self.active_adapter not in self.modules_to_save: - return self.original_cell.weight - return self.modules_to_save[self.active_adapter].weight - - def update(self, adapter_name): - r""" - Updates the ModulesToSaveWrapper with a new adapter. - - Args: - self (ModulesToSaveWrapper): The instance of ModulesToSaveWrapper. - adapter_name (str): The name of the adapter to update. - - Returns: - None. This method does not return any value. - - Raises: - - AttributeError: If the 'modules_to_save' attribute does not contain the specified 'adapter_name'. - - RuntimeError: If an error occurs during the update process. - - ValueError: If the 'adapter_name' parameter is not a string. - """ - context_manager = nullcontext() - for _, param in self.original_cell.parameters_and_names(): - num_params = param.numel() - - with context_manager: - self.modules_to_save.update(nn.ModuleDict({adapter_name: copy.deepcopy(self.original_cell)})) - - if hasattr(self.modules_to_save[adapter_name], "_hf_hook"): - old_hook = self.modules_to_save[adapter_name]._hf_hook - new_hook = self._create_new_hook(old_hook) - # remove_hook_from_cell(self.modules_to_save[adapter_name]) - # add_hook_to_cell(self.modules_to_save[adapter_name], new_hook) - - self.original_cell.requires_grad_(False) - if adapter_name == self.active_adapter: - self.modules_to_save[adapter_name].requires_grad_(True) - - # def _create_new_hook(self, old_hook): - # r""" - # Creates a new hook based on the old hook. Use it only if you know what you are doing ! - # """ - # old_hook_cls = getattr(accelerate.hooks, old_hook.__class__.__name__) - # old_hook_attr = old_hook.__dict__ - # filtered_old_hook_attr = {} - # old_hook_init_signature = inspect.signature(old_hook_cls.__init__) - # for k in old_hook_attr.keys(): - # if k in old_hook_init_signature.parameters: - # filtered_old_hook_attr[k] = old_hook_attr[k] - # new_hook = old_hook_cls(**filtered_old_hook_attr) - # return new_hook - - def forward(self, *args, **kwargs): - r""" - This method forwards and returns the appropriate cell based on the active adapter within the ModulesToSaveWrapper class. - - Args: - self: An instance of the ModulesToSaveWrapper class. - - Returns: - None: This method does not return any value. - - Raises: - - N/A - """ - if self.disable_adapters or (self.active_adapter not in self.modules_to_save): - return self.original_cell(*args, **kwargs) - return self.modules_to_save[self.active_adapter](*args, **kwargs) - - def enable_adapters(self, enabled: bool): - """Toggle the enabling and disabling of adapters - - Takes care of setting the requires_grad flag for the adapter weights. 
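At call time, `ModulesToSaveWrapper.forward` above is just a dispatcher: the original cell runs when adapters are disabled or the active adapter has no saved copy, otherwise the per-adapter deep copy runs. The routing can be sketched without any framework (the lambdas below are stand-ins for real cells):

```py
# Framework-free sketch of the ModulesToSaveWrapper forward dispatch.
class TinyWrapper:
    def __init__(self, original, adapters, active, disabled=False):
        self.original = original     # original callable cell
        self.adapters = adapters     # {adapter_name: callable deep copy}
        self.active = active
        self.disabled = disabled

    def __call__(self, x):
        if self.disabled or self.active not in self.adapters:
            return self.original(x)
        return self.adapters[self.active](x)

w = TinyWrapper(original=lambda x: x, adapters={"default": lambda x: x + 1}, active="default")
print(w(10))        # 11 -> the adapter copy is used
w.disabled = True
print(w(10))        # 10 -> falls back to the original cell
```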
- - Args: - enabled (bool): True to enable adapters, False to disable adapters - """ - if self._disable_adapters is not enabled: - # already in the desired state, do nothing - return - - if enabled: - self.original_cell.requires_grad_(False) - self.modules_to_save[self.active_adapter].requires_grad_(True) - self._disable_adapters = False - else: - self.original_cell.requires_grad_(True) - self.modules_to_save.requires_grad_(False) - self._disable_adapters = True - - def set_adapter(self, adapter_name: str): - """Set the active adapter - - Additionally, this function will set the specified adapter to trainable (i.e., requires_grad=True). If this is - not desired, use the following code. - - ```py - >>> for name, param in model_peft.named_parameters(): - ... if ...: # some check on name (ex. if 'lora' in name) - ... param.requires_grad = False - ``` - - Args: - adapter_name (str): The name of the adapter to set as active - """ - if adapter_name not in self.modules_to_save: - raise ValueError(f"Adapter {adapter_name} not found in {self.modules_to_save.keys()}") - - self.modules_to_save[self.active_adapter].requires_grad_(False) - self.modules_to_save[adapter_name].requires_grad_(True) - self._active_adapter = adapter_name - - -def custom_get_submodule(model: nn.Module, target: str) -> nn.Module: - """ - Returns the submodule given by ``target`` if it exists, otherwise throws an error. - 功能和 torch.nn.Module 相似 - """ - if target == "": - return model - - atoms: List[str] = target.split(".") - mod: nn.Module = model - - for item in atoms: - if not hasattr(mod, item): - raise AttributeError(mod + " has no attribute `" + item + "`") - - mod = getattr(mod, item) - - if not isinstance(mod, nn.Module): - raise AttributeError("`" + item + "` is not an nn.Module") - - return mod - -def _get_submodules(model, key): - """ - get submodules - """ - parent_key = ".".join(key.split(".")[:-1]) - parent = custom_get_submodule(model, parent_key) - target_name = key.split(".")[-1] - target = custom_get_submodule(model, key) - - return parent, target, target_name - - -def _set_trainable(model, adapter_name): - """ - set trainable - """ - key_list = [key for key, _ in model.named_modules()] # named_modules - for key in key_list: - target_module_found = any(key.endswith(target_key) for target_key in model.modules_to_save) - if target_module_found: - parent, target, target_name = _get_submodules(model, key) - - if isinstance(target, ModulesToSaveWrapper): - target.update(adapter_name) - else: - for _, param in target.parameters_and_names(): - param.requires_grad = True - warp_cell = ModulesToSaveWrapper(target, adapter_name) - # parent[int(target_name)] = warp_cell - setattr(parent, target_name, warp_cell) - - # TODO:the implemtation of mindspore, __setitem__ is not consistent with __setattr__ here. - # self.module_list is not set correctly if __setattr__'s value type is Sequential. - # Thus we set it apparently here. This line may be removed later. - if isinstance(parent, nn.Sequential): - parent.module_list = list(parent._modules.values()) - - -def _freeze_adapter(model, adapter_name): - """ - freeze adapter - """ - for n, p in model.parameters_and_names(): - if adapter_name in n: - p.requires_grad = False - - -def _set_adapter(model, adapter_name): - r""" - Sets the active adapter for the given model. - - Args: - model (object): The model for which the active adapter needs to be set. - adapter_name (str): The name of the adapter to be set as active. - - Returns: - None. This function does not return any value. 
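`custom_get_submodule` and `_get_submodules` above resolve a dotted key with a plain `getattr` walk, returning the parent module, the target module, and the attribute name under which the target hangs. The same walk works on any nested Python objects, as in this toy sketch (the `Model`/`Block`/`Leaf` classes are invented for illustration):

```py
# Toy sketch of the _get_submodules getattr walk on plain Python objects.
class Leaf: ...
class Block:
    def __init__(self):
        self.q_proj = Leaf()
class Model:
    def __init__(self):
        self.encoder = Block()

def get_submodule(root, path):
    mod = root
    for item in (path.split(".") if path else []):
        mod = getattr(mod, item)
    return mod

def get_submodules(model, key):
    parent_key = ".".join(key.split(".")[:-1])
    target_name = key.split(".")[-1]
    return get_submodule(model, parent_key), get_submodule(model, key), target_name

model = Model()
parent, target, name = get_submodules(model, "encoder.q_proj")
print(type(parent).__name__, type(target).__name__, name)  # Block Leaf q_proj
```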
- - Raises: - None. This function does not raise any exceptions. - """ - for cell in model.modules(): - if isinstance(cell, ModulesToSaveWrapper): - cell.active_adapter = adapter_name - - -def _prepare_prompt_learning_config(peft_config, model_config): - r""" - Args: - peft_config (object): The PEFT configuration object containing the parameters for prompt learning. - model_config (dict): The model configuration dictionary containing the parameters for the underlying model. - - Returns: - None. The function modifies the peft_config object in place. - - Raises: - ValueError: If 'num_layers', 'token_dim', or 'num_attention_heads' is not specified in peft_config or model_config. - """ - if peft_config.num_layers is None: - if "num_hidden_layers" in model_config: - num_layers = model_config["num_hidden_layers"] - elif "num_layers" in model_config: - num_layers = model_config["num_layers"] - elif "n_layer" in model_config: - num_layers = model_config["n_layer"] - else: - raise ValueError("Please specify `num_layers` in `peft_config`") - peft_config.num_layers = num_layers - - if peft_config.token_dim is None: - if "hidden_size" in model_config: - token_dim = model_config["hidden_size"] - elif "n_embd" in model_config: - token_dim = model_config["n_embd"] - elif "d_model" in model_config: - token_dim = model_config["d_model"] - else: - raise ValueError("Please specify `token_dim` in `peft_config`") - peft_config.token_dim = token_dim - - if peft_config.num_attention_heads is None: - if "num_attention_heads" in model_config: - num_attention_heads = model_config["num_attention_heads"] - elif "n_head" in model_config: - num_attention_heads = model_config["n_head"] - elif "num_heads" in model_config: - num_attention_heads = model_config["num_heads"] - elif "encoder_attention_heads" in model_config: - num_attention_heads = model_config["encoder_attention_heads"] - else: - raise ValueError("Please specify `num_attention_heads` in `peft_config`") - peft_config.num_attention_heads = num_attention_heads - - if getattr(peft_config, "encoder_hidden_size", None) is None: - setattr(peft_config, "encoder_hidden_size", peft_config.token_dim) - - return peft_config - - -def transpose(weight, fan_in_fan_out): - """ - transpose weight - """ - return weight.T if fan_in_fan_out else weight - # return ops.transpose(weight, input_perm=?) if fan_in_fan_out else weight - - -def shift_tokens_right(input_ids: mindspore.Tensor, pad_token_id: int, decoder_start_token_id: int): - """ - Shift input ids one token to the right. - - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): input ids - pad_token_id (`int`): The id of the `padding` token. - decoder_start_token_id (`int`): The id of the `start` token. - """ - shifted_input_ids = input_ids.new_zeros(input_ids.shape) - shifted_input_ids[:, 1:] = input_ids[:, :-1] - shifted_input_ids[:, 0] = decoder_start_token_id - - if pad_token_id is None: - raise ValueError("self.model.config.pad_token_id has to be defined.") - # replace possible -100 values in labels by `pad_token_id` - shifted_input_ids = shifted_input_ids.masked_fill(shifted_input_ids == -100, pad_token_id) - - return shifted_input_ids - -class Conv1D(nn.Module): - """ - 1D-convolutional layer Basically works like a linear layer but the weights are transposed. - - Args: - n_out (`int`): The number of output features. - n_in (`int`): The number of input features. - """ - def __init__(self, n_out, n_in): - r""" - Initializes an instance of the Conv1D class. 
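`shift_tokens_right` above prepends `decoder_start_token_id`, drops the last position, and replaces any `-100` label padding with `pad_token_id`. The effect is easy to see with a numpy sketch (the token ids are made up):

```py
import numpy as np

def shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id):
    shifted = np.zeros_like(input_ids)
    shifted[:, 1:] = input_ids[:, :-1]
    shifted[:, 0] = decoder_start_token_id
    shifted[shifted == -100] = pad_token_id   # -100 is the usual "ignore" label
    return shifted

labels = np.array([[5, 6, 7, -100]])
print(shift_tokens_right(labels, pad_token_id=0, decoder_start_token_id=2))
# [[2 5 6 7]]
```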
- - Args: - self (Conv1D): The instance of the Conv1D class. - n_out (int): The number of output channels or filters. - n_in (int): The number of input channels or filters. - - Returns: - None. This method does not return any value. - - Raises: - None. - """ - super().__init__() - self.n_out = n_out - self.weight = Parameter(initializer(Normal(sigma=0.02), (n_in, n_out), mindspore.float32)) - self.bias = Parameter(ops.zeros(n_out, mindspore.float32)) - - def forward(self, x): - r""" - Constructs the output of a 1D convolutional layer. - - Args: - self (Conv1D): The instance of the Conv1D class. - x (torch.Tensor): The input tensor of shape (batch_size, sequence_length, input_features) representing the input data. - - Returns: - torch.Tensor: The output tensor of shape (batch_size, sequence_length, n_out) representing the result of the 1D convolution operation. - - Raises: - - ValueError: If the input tensor 'x' does not have the expected shape. - - RuntimeError: If an error occurs during the matrix multiplication or bias addition operations. - """ - size_out = x.shape[:-1] + (self.n_out,) - x = self.matmul(x.view(-1, x.shape[-1]), self.weight) + self.bias - x = x.view(size_out) - return x diff --git a/mindnlp/peft/utils/peft_types.py b/mindnlp/peft/utils/peft_types.py deleted file mode 100644 index 31c6cb7a8..000000000 --- a/mindnlp/peft/utils/peft_types.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -Peft types and Task types. -""" -import enum - -class PeftType(str, enum.Enum): - """ - Enum class for the different types of adapters in PEFT. - - Supported PEFT types: - - PROMPT_TUNING - - MULTITASK_PROMPT_TUNING - - P_TUNING - - PREFIX_TUNING - - LORA - - ADALORA - - BOFT - - ADAPTION_PROMPT - - IA3 - - LOHA - - LOKR - - OFT - - POLY - - LN_TUNING - """ - PROMPT_TUNING = "PROMPT_TUNING" - MULTITASK_PROMPT_TUNING = "MULTITASK_PROMPT_TUNING" - P_TUNING = "P_TUNING" - PREFIX_TUNING = "PREFIX_TUNING" - LORA = "LORA" - ADALORA = "ADALORA" - BOFT = "BOFT" - ADAPTION_PROMPT = "ADAPTION_PROMPT" - IA3 = "IA3" - LOHA = "LOHA" - LOKR = "LOKR" - OFT = "OFT" - POLY = "POLY" - LN_TUNING = "LN_TUNING" - VERA = "VERA" - - -class TaskType(str, enum.Enum): - """ - TaskType - """ - SEQ_CLS = "SEQ_CLS" - SEQ_2_SEQ_LM = "SEQ_2_SEQ_LM" - CAUSAL_LM = "CAUSAL_LM" - TOKEN_CLS = "TOKEN_CLS" - QUESTION_ANS = "QUESTION_ANS" - FEATURE_EXTRACTION = "FEATURE_EXTRACTION" diff --git a/mindnlp/peft/utils/save_and_load.py b/mindnlp/peft/utils/save_and_load.py deleted file mode 100644 index cc2be721c..000000000 --- a/mindnlp/peft/utils/save_and_load.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""save and load""" -import os -from collections import OrderedDict - -import mindspore - -from .peft_types import PeftType -from .constants import WEIGHTS_NAME - -def get_data_list(param_dict): - """Get state dict of the Peft model for saving.""" - data_list = OrderedDict() # {key: [dims, tensor_type, data]} - - for key, value in param_dict.items(): - data_list[key] = [] - dims = [] - if value.shape == (): - dims.append(0) - else: - for dim in value.shape: - dims.append(dim) - data_list[key].append(dims) - tensor_type = str(value.dtype) - data_list[key].append(tensor_type) - data = value.asnumpy().reshape(-1) - data_list[key].append(data) - - return data_list - - -def get_peft_model_state_dict(model, state_dict=None, adapter_name="default"): - """ - Get the state dict of the Peft model. - - Args: - model ([`PeftModel`]): The Peft model. - """ - config = model.peft_config[adapter_name] - if state_dict is None: - state_dict = get_data_list(model.parameters_dict()) - if config.peft_type in (PeftType.LORA, PeftType.ADALORA): - # to_return = lora_state_dict(model, bias=model.peft_config.bias) - # adapted from `https://github.com/microsoft/LoRA/blob/main/loralib/utils.py` - # to be used directly with the state dict which is necessary when using DeepSpeed or FSDP - bias = config.bias - if bias == "none": - to_return = {k: state_dict[k] for k in state_dict if "lora_" in k} - elif bias == "all": - to_return = {k: state_dict[k] for k in state_dict if "lora_" in k or "bias" in k} - elif bias == "lora_only": - to_return = {} - for k in state_dict: - if "lora_" in k: - to_return[k] = state_dict[k] - bias_name = k.split("lora_")[0] + "bias" - if bias_name in state_dict: - to_return[bias_name] = state_dict[bias_name] - else: - raise NotImplementedError - to_return = {k: v for k, v in to_return.items() if (("lora_" in k and adapter_name in k) or ("bias" in k))} - if config.peft_type == PeftType.ADALORA: - rank_pattern = config.rank_pattern - if rank_pattern is not None: - rank_pattern = {k.replace(f".{adapter_name}", ""): v for k, v in rank_pattern.items()} - config.rank_pattern = rank_pattern - to_return = model.resize_state_dict_by_rank_pattern(rank_pattern, to_return, adapter_name) - elif config.peft_type == PeftType.ADAPTION_PROMPT: - to_return = {k: state_dict[k] for k in state_dict if k.split(".")[-1].startswith("adaption_")} - elif config.peft_type == PeftType.IA3: - to_return = {k: state_dict[k] for k in state_dict if "ia3_" in k} - elif config.peft_type == PeftType.LOKR: - to_return = {k: state_dict[k] for k in state_dict if "lokr_" in k} - elif config.peft_type == PeftType.POLY: - to_return = {k: state_dict[k] for k in state_dict if "poly_" in k} - elif config.peft_type == PeftType.LN_TUNING: - to_return = {k: state_dict[k] for k in state_dict if "ln_tuning_" in k} - elif config.peft_type == PeftType.LOHA: - to_return = {k: state_dict[k] for k in state_dict if "hada_" in k} - elif config.is_prompt_learning: - to_return = {} - if config.peft_type == PeftType.MULTITASK_PROMPT_TUNING: - 
to_return["prefix_task_cols"] = model.prompt_encoder[adapter_name].prefix_task_cols - to_return["prefix_task_rows"] = model.prompt_encoder[adapter_name].prefix_task_rows - prompt_embeddings = model.prompt_encoder[adapter_name].embedding.weight - else: - if config.inference_mode: - prompt_embeddings = model.prompt_encoder[adapter_name].embedding.weight - else: - prompt_embeddings = model.get_prompt_embedding_to_save(adapter_name) - to_return["prompt_embeddings"] = prompt_embeddings - to_return = get_data_list(to_return) - else: - raise NotImplementedError - - if model.modules_to_save is not None: - for key, value in state_dict.items(): - if any(f"{module_name}.modules_to_save.{adapter_name}" in key for module_name in model.modules_to_save): - to_return[key.replace("modules_to_save.", "")] = value - - to_return = {k.replace(f".{adapter_name}", ""): v for k, v in to_return.items()} - return to_return - - -def set_peft_model_state_dict(model, peft_model_state_dict, adapter_name="default"): - """ - Set the state dict of the Peft model. - - Args: - model ([`PeftModel`]): The Peft model. - peft_model_state_dict (`dict`): The state dict of the Peft model. - """ - config = model.peft_config[adapter_name] - state_dict = {} - strict_load = False - if model.modules_to_save is not None: - for key, value in peft_model_state_dict.items(): - if any(module_name in key for module_name in model.modules_to_save): - for module_name in model.modules_to_save: - if module_name in key: - key = key.replace(module_name, f"{module_name}.modules_to_save.{adapter_name}") - break - state_dict[key] = value - else: - state_dict = peft_model_state_dict - - if config.peft_type in ( - PeftType.LORA, - PeftType.IA3, - PeftType.ADALORA, - PeftType.LOKR, - PeftType.LOHA, - PeftType.POLY, - PeftType.LN_TUNING, - ): - peft_model_state_dict = {} - parameter_prefix = { - PeftType.IA3: "ia3_", - PeftType.LORA: "lora_", - PeftType.ADALORA: "lora_", - PeftType.LOKR: "lokr_", - PeftType.LOHA: "hada_", - PeftType.POLY: "poly_", - PeftType.LN_TUNING: "ln_tuning_", - }[config.peft_type] - for k, v in state_dict.items(): - if parameter_prefix in k: - suffix = k.split(parameter_prefix)[1] - if "." in suffix: - suffix_to_replace = ".".join(suffix.split(".")[1:]) - k = k.replace(suffix_to_replace, f"{adapter_name}.{suffix_to_replace}") - else: - k = f"{k}.{adapter_name}" - peft_model_state_dict[k] = v - else: - peft_model_state_dict[k] = v - if config.peft_type == PeftType.ADALORA: - strict_load = True - rank_pattern = config.rank_pattern - if rank_pattern is not None: - model.resize_modules_by_rank_pattern(rank_pattern, adapter_name) - elif config.is_prompt_learning or config.peft_type == PeftType.ADAPTION_PROMPT: - peft_model_state_dict = state_dict - else: - raise NotImplementedError - - load_result = model.load_state_dict(peft_model_state_dict, strict=False) - if config.is_prompt_learning: - model.prompt_encoder[adapter_name].embedding.load_state_dict( - {"weight": peft_model_state_dict["prompt_embeddings"]}, strict=True - ) - - if config.peft_type == PeftType.MULTITASK_PROMPT_TUNING: - model.prompt_encoder[adapter_name].load_state_dict(peft_model_state_dict, strict=False) - return load_result - - -def load_peft_weights(model_id: str,) -> dict: - r""" - A helper method to load the PEFT weights from local storage. Add download logic later. - - Args: - model_id (`str`): - The local path to the adapter weights or the name of the adapter to load from the HuggingFace Hub. 
- """ - path = model_id - - filename = os.path.join(path, WEIGHTS_NAME) - if not os.path.exists(filename): - # TODO: add download logic later - raise ValueError(f"load peft model failed, peft model file: {filename} not exists.") - - adapters_weights = mindspore.load_checkpoint(filename) - - return adapters_weights diff --git a/mindnlp/readme.md b/mindnlp/readme.md deleted file mode 100644 index e69de29bb..000000000 diff --git a/mindnlp/sentence/models/__init__.py b/mindnlp/sentence/models/__init__.py deleted file mode 100644 index 9e16fa87d..000000000 --- a/mindnlp/sentence/models/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Sentence transformer models init""" - -from .transformer import Transformer -from .pooling import Pooling -from .normalize import Normalize -from .dense import Dense - -__all__ = [ - "Transformer", - "Pooling", - "Normalize", - "Dense" -] diff --git a/mindnlp/sentence/models/dense.py b/mindnlp/sentence/models/dense.py deleted file mode 100644 index 82fc9702f..000000000 --- a/mindnlp/sentence/models/dense.py +++ /dev/null @@ -1,96 +0,0 @@ -"""dense model""" -from __future__ import annotations - -import json -import os - - -from mindspore import Tensor -from mindnlp.core import nn -from mindnlp.core.serialization import load_model as load_safetensors_model, save, load -from mindnlp.core.serialization import save_model as save_safetensors_model - -from ..util import fullname, import_from_string - - -class Dense(nn.Module): - """ - Feed-forward function with activation function. - - This layer takes a fixed-sized sentence embedding and passes it through a feed-forward layer. Can be used to generate deep averaging networks (DAN). 
- - Args: - in_features: Size of the input dimension - out_features: Output size - bias: Add a bias vector - activation_function: Pytorch activation function applied on - output - init_weight: Initial value for the matrix of the linear layer - init_bias: Initial value for the bias of the linear layer - """ - - def __init__( - self, - in_features: int, - out_features: int, - bias: bool = True, - activation_function=nn.Tanh(), - init_weight: Tensor = None, - init_bias: Tensor = None, - ): - super().__init__() - self.in_features = in_features - self.out_features = out_features - self.bias = bias - self.activation_function = activation_function - self.linear = nn.Linear(in_features, out_features, bias=bias) - - if init_weight is not None: - self.linear.weight = nn.Parameter(init_weight) - - if init_bias is not None: - self.linear.bias = nn.Parameter(init_bias) - - def forward(self, features: dict[str, Tensor]): - features.update({"sentence_embedding": self.activation_function(self.linear(features["sentence_embedding"]))}) - return features - - def get_sentence_embedding_dimension(self) -> int: - return self.out_features - - def get_config_dict(self): - return { - "in_features": self.in_features, - "out_features": self.out_features, - "bias": self.bias, - "activation_function": fullname(self.activation_function), - } - - def save(self, output_path, safe_serialization: bool = True) -> None: - with open(os.path.join(output_path, "config.json"), "w") as fOut: - json.dump(self.get_config_dict(), fOut) - - if safe_serialization: - save_safetensors_model(self, os.path.join(output_path, "model.safetensors")) - else: - save(self.state_dict(), os.path.join(output_path, "pytorch_model.bin")) - - def __repr__(self): - return f"Dense({self.get_config_dict()})" - - @staticmethod - def load(input_path): - with open(os.path.join(input_path, "config.json")) as fIn: - config = json.load(fIn) - - config["activation_function"] = import_from_string(config["activation_function"])() - model = Dense(**config) - if os.path.exists(os.path.join(input_path, "model.safetensors")): - load_safetensors_model(model, os.path.join(input_path, "model.safetensors")) - else: - model.load_state_dict( - load( - os.path.join(input_path, "pytorch_model.bin"), weights_only=True - ) - ) - return model diff --git a/mindnlp/sentence/models/normalize.py b/mindnlp/sentence/models/normalize.py deleted file mode 100644 index f1335c629..000000000 --- a/mindnlp/sentence/models/normalize.py +++ /dev/null @@ -1,21 +0,0 @@ -"""normalize model""" -from __future__ import annotations - -from mindspore import Tensor -from mindnlp.core.nn import functional as F -from mindnlp.core import nn - - -class Normalize(nn.Module): - """This layer normalizes embeddings to unit length""" - - def forward(self, features: dict[str, Tensor]) -> dict[str, Tensor]: - features.update({"sentence_embedding": F.normalize(features["sentence_embedding"], p=2, dim=1)}) - return features - - def save(self, output_path) -> None: - pass - - @staticmethod - def load(input_path) -> Normalize: - return Normalize() diff --git a/mindnlp/sentence/models/pooling.py b/mindnlp/sentence/models/pooling.py deleted file mode 100644 index 7e07d57c2..000000000 --- a/mindnlp/sentence/models/pooling.py +++ /dev/null @@ -1,251 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Pooling module""" - -import json -import os -from typing import Dict - -import mindspore -from mindspore import Tensor -from mindnlp.core import nn, ops - - -class Pooling(nn.Module): - """Performs pooling (max or mean) on the token embeddings. - - Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. This layer also allows - to use the CLS token if it is returned by the underlying word embedding model. You can concatenate multiple poolings - together. - - :param word_embedding_dimension: Dimensions for the word embeddings :param pooling_mode: Either "cls", - "lasttoken", "max", "mean", "mean_sqrt_len_tokens", or "weightedmean". If set, overwrites the other - pooling_mode_* settings :param pooling_mode_cls_token: Use the first token (CLS token) as text representations - :param pooling_mode_max_tokens: Use max in each dimension over all tokens. :param pooling_mode_mean_tokens: - Perform mean-pooling :param pooling_mode_mean_sqrt_len_tokens: Perform mean-pooling, but divide by sqrt( - input_length). :param pooling_mode_weightedmean_tokens: Perform (position) weighted mean pooling. See `SGPT: GPT - Sentence Embeddings for Semantic Search `_. :param pooling_mode_lasttoken: - Perform last token pooling. See `SGPT: GPT Sentence Embeddings for Semantic Search - `_ and `Text and Code Embeddings by Contrastive Pre-Training - `_. - """ - - POOLING_MODES = ( - "cls", - "lasttoken", - "max", - "mean", - "mean_sqrt_len_tokens", - "weightedmean", - ) - - def __init__( - self, - word_embedding_dimension: int, - pooling_mode: str = None, - pooling_mode_cls_token: bool = False, - pooling_mode_max_tokens: bool = False, - pooling_mode_mean_tokens: bool = True, - pooling_mode_mean_sqrt_len_tokens: bool = False, - pooling_mode_weightedmean_tokens: bool = False, - pooling_mode_lasttoken: bool = False, - include_prompt=True, - ) -> None: - super(Pooling, self).__init__() - - self.config_keys = [ - "word_embedding_dimension", - "pooling_mode_cls_token", - "pooling_mode_mean_tokens", - "pooling_mode_max_tokens", - "pooling_mode_mean_sqrt_len_tokens", - "pooling_mode_weightedmean_tokens", - "pooling_mode_lasttoken", - "include_prompt", - ] - - if pooling_mode is not None: # Set pooling mode by string - pooling_mode = pooling_mode.lower() - - if pooling_mode not in self.POOLING_MODES: - raise ValueError( - f"Set invalid pooling mode: {pooling_mode}. Valid pooling modes are: {self.POOLING_MODES}." 
- ) - - pooling_mode_cls_token = pooling_mode == "cls" - pooling_mode_max_tokens = pooling_mode == "max" - pooling_mode_mean_tokens = pooling_mode == "mean" - pooling_mode_mean_sqrt_len_tokens = pooling_mode == "mean_sqrt_len_tokens" - pooling_mode_weightedmean_tokens = pooling_mode == "weightedmean" - pooling_mode_lasttoken = pooling_mode == "lasttoken" - - self.word_embedding_dimension = word_embedding_dimension - self.pooling_mode_cls_token = pooling_mode_cls_token - self.pooling_mode_mean_tokens = pooling_mode_mean_tokens - self.pooling_mode_max_tokens = pooling_mode_max_tokens - self.pooling_mode_mean_sqrt_len_tokens = pooling_mode_mean_sqrt_len_tokens - self.pooling_mode_weightedmean_tokens = pooling_mode_weightedmean_tokens - self.pooling_mode_lasttoken = pooling_mode_lasttoken - - self.include_prompt = include_prompt - - pooling_mode_multiplier = sum( - [ - pooling_mode_cls_token, - pooling_mode_max_tokens, - pooling_mode_mean_tokens, - pooling_mode_mean_sqrt_len_tokens, - pooling_mode_weightedmean_tokens, - pooling_mode_lasttoken, - ] - ) - self.pooling_output_dimension = pooling_mode_multiplier * word_embedding_dimension - - def __repr__(self): - return "Pooling({})".format(self.get_config_dict()) - - def get_pooling_mode_str(self) -> str: - """ - Returns the pooling mode as string - """ - modes = [] - if self.pooling_mode_cls_token: - modes.append("cls") - if self.pooling_mode_mean_tokens: - modes.append("mean") - if self.pooling_mode_max_tokens: - modes.append("max") - if self.pooling_mode_mean_sqrt_len_tokens: - modes.append("mean_sqrt_len_tokens") - if self.pooling_mode_weightedmean_tokens: - modes.append("weightedmean") - if self.pooling_mode_lasttoken: - modes.append("lasttoken") - - return "+".join(modes) - - def forward(self, features: Dict[str, Tensor]): - token_embeddings = features["token_embeddings"] - attention_mask = features["attention_mask"] - if not self.include_prompt and "prompt_length" in features: - attention_mask[:, : features["prompt_length"]] = 0 - - ## Pooling strategy - output_vectors = [] - if self.pooling_mode_cls_token: - cls_token = features.get("cls_token_embeddings", token_embeddings[:, 0]) # Take first token by default - output_vectors.append(cls_token) - if self.pooling_mode_max_tokens: - input_mask_expanded = ( - attention_mask.unsqueeze(-1).broadcast_to(token_embeddings.shape).to(token_embeddings.dtype) - ) - token_embeddings[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value - max_over_time = ops.max(token_embeddings, 1)[0] - output_vectors.append(max_over_time) - if self.pooling_mode_mean_tokens or self.pooling_mode_mean_sqrt_len_tokens: - attention_mask = attention_mask.unsqueeze(-1) - token_embeddings_size = token_embeddings.shape - attention_mask = attention_mask.broadcast_to(token_embeddings_size) - input_mask_expanded = ( - attention_mask.to(token_embeddings.dtype) - ) - sum_embeddings = ops.sum(token_embeddings * input_mask_expanded, 1) - - # If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present - if "token_weights_sum" in features: - sum_mask = features["token_weights_sum"].unsqueeze(-1).broadcast_to(sum_embeddings.shape) - else: - sum_mask = input_mask_expanded.sum(1) - - sum_mask = ops.clamp(sum_mask, min=1e-9) - - if self.pooling_mode_mean_tokens: - output_vectors.append(sum_embeddings / sum_mask) - if self.pooling_mode_mean_sqrt_len_tokens: - output_vectors.append(sum_embeddings / ops.sqrt(sum_mask)) - if self.pooling_mode_weightedmean_tokens: - input_mask_expanded = ( 
- attention_mask.unsqueeze(-1).broadcast_to(token_embeddings.shape).to(token_embeddings.dtype) - ) - # token_embeddings shape: bs, seq, hidden_dim - weights = ( - ops.arange(start=1, end=token_embeddings.shape[1] + 1) - .unsqueeze(0) - .unsqueeze(-1) - .broadcast_to(token_embeddings.shape) - .to(token_embeddings.dtype) - ) - assert weights.shape == token_embeddings.shape == input_mask_expanded.shape - input_mask_expanded = input_mask_expanded * weights - - sum_embeddings = ops.sum(token_embeddings * input_mask_expanded, 1) - - # If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present - if "token_weights_sum" in features: - sum_mask = features["token_weights_sum"].unsqueeze(-1).broadcast_to(sum_embeddings.shape) - else: - sum_mask = input_mask_expanded.sum(1) - - sum_mask = ops.clamp(sum_mask, min=1e-9) - output_vectors.append(sum_embeddings / sum_mask) - if self.pooling_mode_lasttoken: - bs, seq_len, hidden_dim = token_embeddings.shape - # attention_mask shape: (bs, seq_len) - # Get shape [bs] indices of the last token (i.e. the last token for each batch item) - # Use flip and max() to get the last index of 1 in the attention mask - - if mindspore.jit.is_tracing(): - # Avoid tracing the argmax with int64 input that can not be handled by ONNX Runtime: https://github.com/microsoft/onnxruntime/issues/10068 - attention_mask = attention_mask.to(mindspore.int32) - - values, indices = attention_mask.flip(1).max(1) - indices = mindspore.where(values == 0, seq_len - 1, indices) - gather_indices = seq_len - indices - 1 - - # Turn indices from shape [bs] --> [bs, 1, hidden_dim] - gather_indices = gather_indices.unsqueeze(-1).repeat(1, hidden_dim) - gather_indices = gather_indices.unsqueeze(1) - assert gather_indices.shape == (bs, 1, hidden_dim) - - # Gather along the 1st dim (seq_len) (bs, seq_len, hidden_dim -> bs, hidden_dim) - # Actually no need for the attention mask as we gather the last token where attn_mask = 1 - # but as we set some indices (which shouldn't be attended to) to 0 with clamp, we - # use the attention mask to ignore them again - input_mask_expanded = ( - attention_mask.unsqueeze(-1).broadcast_to(token_embeddings.shape).to(token_embeddings.dtype) - ) - embedding = ops.gather(token_embeddings * input_mask_expanded, 1, gather_indices).squeeze(dim=1) - output_vectors.append(embedding) - - output_vector = ops.cat(output_vectors, 1) - features.update({"sentence_embedding": output_vector}) - return features - - def get_sentence_embedding_dimension(self): - return self.pooling_output_dimension - - def get_config_dict(self): - return {key: self.__dict__[key] for key in self.config_keys} - - def save(self, output_path): - with open(os.path.join(output_path, "config.json"), "w") as fOut: - json.dump(self.get_config_dict(), fOut, indent=2) - - @staticmethod - def load(input_path): - with open(os.path.join(input_path, "config.json")) as fIn: - config = json.load(fIn) - - return Pooling(**config) diff --git a/mindnlp/sentence/models/transformer.py b/mindnlp/sentence/models/transformer.py deleted file mode 100644 index beb9ac4e3..000000000 --- a/mindnlp/sentence/models/transformer.py +++ /dev/null @@ -1,266 +0,0 @@ -"""transformer model""" -from __future__ import annotations - -import json -import logging -import os -from pathlib import Path -from typing import Any, Callable - -import mindspore -from ...core import nn, ops -from ...transformers import AutoConfig, AutoModel, AutoTokenizer, MT5Config, T5Config -from ...utils.peft_utils import 
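As an aside on the pooling arithmetic above: mean pooling sums the mask-weighted token embeddings and divides by the clamped mask sum, and last-token pooling locates the final non-padding position by flipping the attention mask and taking the first 1. A NumPy sketch of both (shapes and values are illustrative):

import numpy as np

def masked_mean_pool(token_embeddings, attention_mask, eps=1e-9):
    # token_embeddings: (batch, seq_len, hidden); attention_mask: (batch, seq_len) of 0/1
    mask = attention_mask[..., None].astype(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(axis=1)
    counts = np.clip(mask.sum(axis=1), eps, None)        # same role as ops.clamp(sum_mask, min=1e-9)
    return summed / counts

def last_token_indices(attention_mask):
    # Flip the mask and take the first 1, as the lasttoken branch does above
    seq_len = attention_mask.shape[1]
    flipped = attention_mask[:, ::-1]
    values, indices = flipped.max(axis=1), flipped.argmax(axis=1)
    indices = np.where(values == 0, seq_len - 1, indices)    # all-padding rows fall back
    return seq_len - indices - 1

tok = np.arange(24, dtype=np.float32).reshape(2, 4, 3)
mask = np.array([[1, 1, 1, 0], [1, 1, 0, 0]])
print(masked_mean_pool(tok, mask))    # averages only the unmasked tokens of each row
print(last_token_indices(mask))       # [2 1] -- index of the last attended token per row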
find_adapter_config_file -from ...peft import PeftConfig, PeftModel, PeftModelForFeatureExtraction - -logger = logging.getLogger(__name__) - - -def _save_pretrained_wrapper(_save_pretrained_fn: Callable, subfolder: str) -> Callable[..., None]: - def wrapper(save_directory: str | Path, **kwargs) -> None: - os.makedirs(Path(save_directory) / subfolder, exist_ok=True) - return _save_pretrained_fn(Path(save_directory) / subfolder, **kwargs) - - return wrapper - - -class Transformer(nn.Module): - """Hugging Face AutoModel to generate token embeddings. - Loads the correct class, e.g. BERT / RoBERTa etc. - - Args: - model_name_or_path: Hugging Face models name - (https://huggingface.co/models) - max_seq_length: Truncate any inputs longer than max_seq_length - model_args: Keyword arguments passed to the Hugging Face - Transformers model - tokenizer_args: Keyword arguments passed to the Hugging Face - Transformers tokenizer - config_args: Keyword arguments passed to the Hugging Face - Transformers config - cache_dir: Cache dir for Hugging Face Transformers to store/load - models - do_lower_case: If true, lowercases the input (independent if the - model is cased or not) - tokenizer_name_or_path: Name or path of the tokenizer. When - None, then model_name_or_path is used - """ - - save_in_root: bool = True - - def __init__( - self, - model_name_or_path: str, - max_seq_length: int | None = None, - model_args: dict[str, Any] | None = None, - tokenizer_args: dict[str, Any] | None = None, - config_args: dict[str, Any] | None = None, - cache_dir: str | None = None, - do_lower_case: bool = False, - tokenizer_name_or_path: str = None, - ) -> None: - super().__init__() - self.config_keys = ["max_seq_length", "do_lower_case"] - self.do_lower_case = do_lower_case - if model_args is None: - model_args = {} - if tokenizer_args is None: - tokenizer_args = {} - if config_args is None: - config_args = {} - - config = self._load_config(model_name_or_path, cache_dir, config_args) - self._load_model(model_name_or_path, config, cache_dir, **model_args) - - if max_seq_length is not None and "model_max_length" not in tokenizer_args: - tokenizer_args["model_max_length"] = max_seq_length - self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name_or_path if tokenizer_name_or_path is not None else model_name_or_path, - cache_dir=cache_dir, - **tokenizer_args, - ) - - # No max_seq_length set. 
Try to infer from model - if max_seq_length is None: - if ( - hasattr(self.auto_model, "config") - and hasattr(self.auto_model.config, "max_position_embeddings") - and hasattr(self.tokenizer, "model_max_length") - ): - max_seq_length = min(self.auto_model.config.max_position_embeddings, self.tokenizer.model_max_length) - - self.max_seq_length = max_seq_length - - if tokenizer_name_or_path is not None: - self.auto_model.config.tokenizer_class = self.tokenizer.__class__.__name__ - - def _load_config(self, model_name_or_path: str, cache_dir: str | None, config_args: dict[str, Any]): - """Loads the configuration of a model""" - if ( - find_adapter_config_file( - model_name_or_path, - token=config_args.get("token"), - revision=config_args.get("revision"), - local_files_only=config_args.get("local_files_only", False), - ) - is not None - ): - - return PeftConfig.from_pretrained(model_name_or_path, **config_args, cache_dir=cache_dir) - - return AutoConfig.from_pretrained(model_name_or_path, **config_args, cache_dir=cache_dir) - - def _load_model(self, model_name_or_path, config, cache_dir, **model_args) -> None: - """Loads the transformer model""" - if isinstance(config, T5Config): - self._load_t5_model(model_name_or_path, config, cache_dir, **model_args) - elif isinstance(config, MT5Config): - self._load_mt5_model(model_name_or_path, config, cache_dir, **model_args) - else: - self.auto_model = AutoModel.from_pretrained( - model_name_or_path, config=config, cache_dir=cache_dir, **model_args - ) - self._load_peft_model(model_name_or_path, config, cache_dir, **model_args) - - def _load_peft_model(self, model_name_or_path, config, cache_dir, **model_args) -> None: - if isinstance(config, PeftConfig): - self.auto_model = PeftModel.from_pretrained( - self.auto_model, model_name_or_path, config=config, cache_dir=cache_dir, **model_args - ) - - - def _load_t5_model(self, model_name_or_path, config, cache_dir, **model_args) -> None: - """Loads the encoder model from T5""" - from mindnlp.transformers import T5EncoderModel - - T5EncoderModel._keys_to_ignore_on_load_unexpected = ["decoder.*"] - self.auto_model = T5EncoderModel.from_pretrained( - model_name_or_path, config=config, cache_dir=cache_dir, **model_args - ) - - def _load_mt5_model(self, model_name_or_path, config, cache_dir, **model_args) -> None: - """Loads the encoder model from T5""" - from mindnlp.transformers import MT5EncoderModel - - MT5EncoderModel._keys_to_ignore_on_load_unexpected = ["decoder.*"] - self.auto_model = MT5EncoderModel.from_pretrained( - model_name_or_path, config=config, cache_dir=cache_dir, **model_args - ) - - def __repr__(self) -> str: - return f"Transformer({self.get_config_dict()}) with Transformer model: {self.auto_model.__class__.__name__} " - - def forward(self, features: dict[str, mindspore.Tensor], **kwargs) -> dict[str, mindspore.Tensor]: - """Returns token_embeddings, cls_token""" - trans_features = {"input_ids": features["input_ids"], "attention_mask": features["attention_mask"]} - if "token_type_ids" in features: - trans_features["token_type_ids"] = features["token_type_ids"] - - output_states = self.auto_model(**trans_features, **kwargs, return_dict=False) - output_tokens = output_states[0] - - # If the AutoModel is wrapped with a PeftModelForFeatureExtraction, then it may have added virtual tokens - # We need to extend the attention mask to include these virtual tokens, or the pooling will fail - - if ( - isinstance(self.auto_model, PeftModelForFeatureExtraction) - and 
self.auto_model.active_peft_config.is_prompt_learning - ): - batch_size = output_tokens.size(0) - attention_mask = features["attention_mask"] - prefix_attention_mask = ops.ones( - batch_size, self.auto_model.active_peft_config.num_virtual_tokens - ) - features["attention_mask"] = ops.cat((prefix_attention_mask, attention_mask), dim=1) - - features["token_embeddings"] = output_tokens - - if self.auto_model.config.output_hidden_states and len(output_states) > 2: - all_layer_idx = 2 # I.e. after last_hidden_states and pooler_output - if len(output_states) < 3: # Some models only output last_hidden_states and all_hidden_states - all_layer_idx = 1 - - hidden_states = output_states[all_layer_idx] - features["all_layer_embeddings"] = hidden_states - - return features - - def get_word_embedding_dimension(self) -> int: - return self.auto_model.config.hidden_size - - def tokenize( - self, texts: list[str] | list[dict] | list[tuple[str, str]], padding: str | bool = True - ) -> dict[str, mindspore.Tensor]: - """Tokenizes a text and maps tokens to token-ids""" - output = {} - if isinstance(texts[0], str): - to_tokenize = [texts] - elif isinstance(texts[0], dict): - to_tokenize = [] - output["text_keys"] = [] - for lookup in texts: - text_key, text = next(iter(lookup.items())) - to_tokenize.append(text) - output["text_keys"].append(text_key) - to_tokenize = [to_tokenize] - else: - batch1, batch2 = [], [] - for text_tuple in texts: - batch1.append(text_tuple[0]) - batch2.append(text_tuple[1]) - to_tokenize = [batch1, batch2] - - # strip - to_tokenize = [[str(s).strip() for s in col] for col in to_tokenize] - - # Lowercase - if self.do_lower_case: - to_tokenize = [[s.lower() for s in col] for col in to_tokenize] - - output.update( - self.tokenizer( - *to_tokenize, - padding=padding, - truncation="longest_first", - return_tensors="ms", - max_length=self.max_seq_length, - ) - ) - return output - - def get_config_dict(self) -> dict[str, Any]: - return {key: self.__dict__[key] for key in self.config_keys} - - def save(self, output_path: str, safe_serialization: bool = True) -> None: - self.auto_model.save_pretrained(output_path, safe_serialization=safe_serialization) - self.tokenizer.save_pretrained(output_path) - - with open(os.path.join(output_path, "sentence_bert_config.json"), "w") as fOut: - json.dump(self.get_config_dict(), fOut, indent=2) - - @classmethod - def load(cls, input_path: str) -> Transformer: - # Old classes used other config names than 'sentence_bert_config.json' - for config_name in [ - "sentence_bert_config.json", - "sentence_roberta_config.json", - "sentence_distilbert_config.json", - "sentence_camembert_config.json", - "sentence_albert_config.json", - "sentence_xlm-roberta_config.json", - "sentence_xlnet_config.json", - ]: - sbert_config_path = os.path.join(input_path, config_name) - if os.path.exists(sbert_config_path): - break - - with open(sbert_config_path) as fIn: - config = json.load(fIn) - # Don't allow configs to set trust_remote_code - if "model_args" in config and "trust_remote_code" in config["model_args"]: - config["model_args"].pop("trust_remote_code") - if "tokenizer_args" in config and "trust_remote_code" in config["tokenizer_args"]: - config["tokenizer_args"].pop("trust_remote_code") - if "config_args" in config and "trust_remote_code" in config["config_args"]: - config["config_args"].pop("trust_remote_code") - return cls(model_name_or_path=input_path, **config) diff --git a/mindnlp/sentence/sentence_transformer.py b/mindnlp/sentence/sentence_transformer.py deleted file 
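Transformer.tokenize above accepts three input shapes: a list of strings, a list of single-key dicts, and a list of (text, text) pairs. A small sketch of how those are regrouped into tokenizer columns (the tokenizer call itself is omitted; the helper name is made up for illustration):

def regroup_for_tokenizer(texts):
    if isinstance(texts[0], str):
        return [texts], None                       # one column of plain strings
    if isinstance(texts[0], dict):
        keys, column = [], []
        for lookup in texts:                       # each dict holds a single key: text pair
            key, text = next(iter(lookup.items()))
            keys.append(key)
            column.append(text)
        return [column], keys
    first, second = zip(*texts)                    # list of (text_a, text_b) pairs
    return [list(first), list(second)], None

print(regroup_for_tokenizer(["a", "b"]))                    # ([['a', 'b']], None)
print(regroup_for_tokenizer([("q1", "d1"), ("q2", "d2")]))  # ([['q1', 'q2'], ['d1', 'd2']], None)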
mode 100644 index 623e60dd4..000000000 --- a/mindnlp/sentence/sentence_transformer.py +++ /dev/null @@ -1,629 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Sentence Transformer""" -import os -import json -from collections import OrderedDict -from typing import Optional, Iterable, Dict, Union, List, Literal, Tuple, Any -from pathlib import Path - -import numpy as np -from tqdm import trange - -import mindspore -from mindspore import Tensor -from mindnlp.core import nn, ops, no_grad -from mindnlp.utils import logging -from .models import Transformer, Pooling, Normalize -from .util import ( - truncate_embeddings, - is_sentence_transformer_model, - load_file_path, - import_from_string, - load_dir_path -) -from .similarity_functions import SimilarityFunction - -logger = logging.get_logger(__name__) - -__MODEL_HUB_ORGANIZATION__ = "sentence-transformers" - -class SentenceTransformer(nn.Sequential): - def __init__( - self, - model_name_or_path: str = None, - modules: Iterable[nn.Module] = None, - prompts: dict[str, str] = None, - default_prompt_name: str = None, - similarity_fn_name: str = None, - cache_folder: str = None, - trust_remote_code: bool = False, - revision: str = None, - local_files_only: bool = False, - token: bool = None, - truncate_dim: int = None, - model_kwargs: dict[str, Any] = None, - tokenizer_kwargs: dict[str, Any] = None, - config_kwargs: dict[str, Any] = None, - ): - self.prompts = prompts or {} - self.default_prompt_name = default_prompt_name - self.similarity_fn_name = similarity_fn_name - self.trust_remote_code = trust_remote_code - self.truncate_dim = truncate_dim - self.module_kwargs = None - self._model_card_vars = {} - self._model_card_text = None - self._model_config = {} - - if model_name_or_path is not None and model_name_or_path != "": - logger.info(f"Load pretrained SentenceTransformer: {model_name_or_path}") - - # Old models that don't belong to any organization - basic_transformer_models = [ - "albert-base-v1", - "albert-base-v2", - "albert-large-v1", - "albert-large-v2", - "albert-xlarge-v1", - "albert-xlarge-v2", - "albert-xxlarge-v1", - "albert-xxlarge-v2", - "bert-base-cased-finetuned-mrpc", - "bert-base-cased", - "bert-base-chinese", - "bert-base-german-cased", - "bert-base-german-dbmdz-cased", - "bert-base-german-dbmdz-uncased", - "bert-base-multilingual-cased", - "bert-base-multilingual-uncased", - "bert-base-uncased", - "bert-large-cased-whole-word-masking-finetuned-squad", - "bert-large-cased-whole-word-masking", - "bert-large-cased", - "bert-large-uncased-whole-word-masking-finetuned-squad", - "bert-large-uncased-whole-word-masking", - "bert-large-uncased", - "camembert-base", - "ctrl", - "distilbert-base-cased-distilled-squad", - "distilbert-base-cased", - "distilbert-base-german-cased", - "distilbert-base-multilingual-cased", - "distilbert-base-uncased-distilled-squad", - 
"distilbert-base-uncased-finetuned-sst-2-english", - "distilbert-base-uncased", - "distilgpt2", - "distilroberta-base", - "gpt2-large", - "gpt2-medium", - "gpt2-xl", - "gpt2", - "openai-gpt", - "roberta-base-openai-detector", - "roberta-base", - "roberta-large-mnli", - "roberta-large-openai-detector", - "roberta-large", - "t5-11b", - "t5-3b", - "t5-base", - "t5-large", - "t5-small", - "transfo-xl-wt103", - "xlm-clm-ende-1024", - "xlm-clm-enfr-1024", - "xlm-mlm-100-1280", - "xlm-mlm-17-1280", - "xlm-mlm-en-2048", - "xlm-mlm-ende-1024", - "xlm-mlm-enfr-1024", - "xlm-mlm-enro-1024", - "xlm-mlm-tlm-xnli15-1024", - "xlm-mlm-xnli15-1024", - "xlm-roberta-base", - "xlm-roberta-large-finetuned-conll02-dutch", - "xlm-roberta-large-finetuned-conll02-spanish", - "xlm-roberta-large-finetuned-conll03-english", - "xlm-roberta-large-finetuned-conll03-german", - "xlm-roberta-large", - "xlnet-base-cased", - "xlnet-large-cased", - ] - - if not os.path.exists(model_name_or_path): - # Not a path, load from hub - if "\\" in model_name_or_path or model_name_or_path.count("/") > 1: - raise ValueError(f"Path {model_name_or_path} not found") - - if "/" not in model_name_or_path and model_name_or_path.lower() not in basic_transformer_models: - # A model from sentence-transformers - model_name_or_path = __MODEL_HUB_ORGANIZATION__ + "/" + model_name_or_path - - if is_sentence_transformer_model( - model_name_or_path, - token, - cache_folder=cache_folder, - revision=revision, - local_files_only=local_files_only, - ): - modules, self.module_kwargs = self._load_sbert_model( - model_name_or_path, - token=token, - cache_folder=cache_folder, - revision=revision, - trust_remote_code=trust_remote_code, - local_files_only=local_files_only, - model_kwargs=model_kwargs, - tokenizer_kwargs=tokenizer_kwargs, - config_kwargs=config_kwargs, - ) - else: - modules = self._load_auto_model( - model_name_or_path, - token=token, - cache_folder=cache_folder, - revision=revision, - trust_remote_code=trust_remote_code, - local_files_only=local_files_only, - model_kwargs=model_kwargs, - tokenizer_kwargs=tokenizer_kwargs, - config_kwargs=config_kwargs, - ) - - if modules is not None and not isinstance(modules, OrderedDict): - modules = OrderedDict([(str(idx), module) for idx, module in enumerate(modules)]) - - super().__init__(modules) - - def _load_module_class_from_ref( - self, - class_ref: str, - model_name_or_path: str, - trust_remote_code: bool, - revision: str, - model_kwargs: dict[str, Any], - ) -> nn.Module: - # If the class is from sentence_transformers, we can directly import it, - # otherwise, we try to import it dynamically, and if that fails, we fall back to the default import - if class_ref.startswith("sentence_transformers."): - return import_from_string(class_ref) - - return import_from_string(class_ref) - - def _load_sbert_model( - self, - model_name_or_path: str, - token: str, - cache_folder: str, - revision: str = None, - trust_remote_code: bool = False, - local_files_only: bool = False, - model_kwargs: dict[str, Any] = None, - tokenizer_kwargs: dict[str, Any] = None, - config_kwargs: dict[str, Any] = None, - ) -> dict[str, nn.Module]: - """ - Loads a full SentenceTransformer model using the modules.json file. - - Args: - model_name_or_path (str): The name or path of the pre-trained model. - token (Optional[Union[bool, str]]): The token to use for the model. - cache_folder (Optional[str]): The folder to cache the model. - revision (Optional[str], optional): The revision of the model. Defaults to None. 
- trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False. - local_files_only (bool, optional): Whether to use only local files. Defaults to False. - model_kwargs (Optional[Dict[str, Any]], optional): Additional keyword arguments for the model. Defaults to None. - tokenizer_kwargs (Optional[Dict[str, Any]], optional): Additional keyword arguments for the tokenizer. Defaults to None. - config_kwargs (Optional[Dict[str, Any]], optional): Additional keyword arguments for the config. Defaults to None. - - Returns: - OrderedDict[str, nn.Module]: An ordered dictionary containing the modules of the model. - """ - # Check if the config_sentence_transformers.json file exists (exists since v2 of the framework) - config_sentence_transformers_json_path = load_file_path( - model_name_or_path, - "config_sentence_transformers.json", - token=token, - cache_folder=cache_folder, - revision=revision, - local_files_only=local_files_only, - ) - if config_sentence_transformers_json_path is not None: - with open(config_sentence_transformers_json_path) as fIn: - self._model_config = json.load(fIn) - - # Set score functions & prompts if not already overridden by the __init__ calls - if self._similarity_fn_name is None: - self.similarity_fn_name = self._model_config.get("similarity_fn_name", None) - if not self.prompts: - self.prompts = self._model_config.get("prompts", {}) - if not self.default_prompt_name: - self.default_prompt_name = self._model_config.get("default_prompt_name", None) - - # Check if a readme exists - model_card_path = load_file_path( - model_name_or_path, - "README.md", - token=token, - cache_folder=cache_folder, - revision=revision, - local_files_only=local_files_only, - ) - if model_card_path is not None: - try: - with open(model_card_path, encoding="utf8") as fIn: - self._model_card_text = fIn.read() - except Exception: - pass - - # Load the modules of sentence transformer - modules_json_path = load_file_path( - model_name_or_path, - "modules.json", - token=token, - cache_folder=cache_folder, - revision=revision, - local_files_only=local_files_only, - ) - with open(modules_json_path) as fIn: - modules_config = json.load(fIn) - - modules = OrderedDict() - module_kwargs = OrderedDict() - for module_config in modules_config: - class_ref = module_config["type"] - module_class = self._load_module_class_from_ref( - class_ref, model_name_or_path, trust_remote_code, revision, model_kwargs - ) - - # For Transformer, don't load the full directory, rely on `transformers` instead - # But, do load the config file first. 
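Each modules.json entry carries a dotted class reference in its "type" field, which the loader resolves to a class before instantiating or load()-ing it. A hedged sketch of that resolution step, using a stdlib class so the snippet runs anywhere (the helper name is illustrative, standing in for import_from_string):

import importlib

def import_from_dotted_path(class_ref):
    # "pkg.module.Class" -> the class object
    module_path, _, class_name = class_ref.rpartition(".")
    return getattr(importlib.import_module(module_path), class_name)

print(import_from_dotted_path("collections.OrderedDict"))   # <class 'collections.OrderedDict'>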
- if module_config["path"] == "": - kwargs = {} - for config_name in [ - "sentence_bert_config.json", - "sentence_roberta_config.json", - "sentence_distilbert_config.json", - "sentence_camembert_config.json", - "sentence_albert_config.json", - "sentence_xlm-roberta_config.json", - "sentence_xlnet_config.json", - ]: - config_path = load_file_path( - model_name_or_path, - config_name, - token=token, - cache_folder=cache_folder, - revision=revision, - local_files_only=local_files_only, - ) - if config_path is not None: - with open(config_path) as fIn: - kwargs = json.load(fIn) - # Don't allow configs to set trust_remote_code - if "model_args" in kwargs and "trust_remote_code" in kwargs["model_args"]: - kwargs["model_args"].pop("trust_remote_code") - if "tokenizer_args" in kwargs and "trust_remote_code" in kwargs["tokenizer_args"]: - kwargs["tokenizer_args"].pop("trust_remote_code") - if "config_args" in kwargs and "trust_remote_code" in kwargs["config_args"]: - kwargs["config_args"].pop("trust_remote_code") - break - - hub_kwargs = { - "token": token, - "trust_remote_code": trust_remote_code, - "revision": revision, - "local_files_only": local_files_only, - } - # 3rd priority: config file - if "model_args" not in kwargs: - kwargs["model_args"] = {} - if "tokenizer_args" not in kwargs: - kwargs["tokenizer_args"] = {} - if "config_args" not in kwargs: - kwargs["config_args"] = {} - - # 2nd priority: hub_kwargs - kwargs["model_args"].update(hub_kwargs) - kwargs["tokenizer_args"].update(hub_kwargs) - kwargs["config_args"].update(hub_kwargs) - - # 1st priority: kwargs passed to SentenceTransformer - if model_kwargs: - kwargs["model_args"].update(model_kwargs) - if tokenizer_kwargs: - kwargs["tokenizer_args"].update(tokenizer_kwargs) - if config_kwargs: - kwargs["config_args"].update(config_kwargs) - - # Try to initialize the module with a lot of kwargs, but only if the module supports them - # Otherwise we fall back to the load method - try: - module = module_class(model_name_or_path, cache_dir=cache_folder, **kwargs) - except TypeError: - module = module_class.load(model_name_or_path) - else: - # Normalize does not require any files to be loaded - if module_class == Normalize: - module_path = None - else: - module_path = load_dir_path( - model_name_or_path, - module_config["path"], - token=token, - cache_folder=cache_folder, - revision=revision, - local_files_only=local_files_only, - ) - module = module_class.load(module_path) - - modules[module_config["name"]] = module - module_kwargs[module_config["name"]] = module_config.get("kwargs", []) - - if revision is None: - path_parts = Path(modules_json_path) - if len(path_parts.parts) >= 2: - revision_path_part = Path(modules_json_path).parts[-2] - if len(revision_path_part) == 40: - revision = revision_path_part - - return modules, module_kwargs - - def _load_auto_model( - self, - model_name_or_path: str, - token: str, - cache_folder: str, - revision: str = None, - trust_remote_code: bool = False, - local_files_only: bool = False, - model_kwargs: dict[str, Any] = None, - tokenizer_kwargs: dict[str, Any] = None, - config_kwargs: dict[str, Any] = None, - ) -> list[nn.Module]: - """ - Creates a simple Transformer + Mean Pooling model and returns the modules - - Args: - model_name_or_path (str): The name or path of the pre-trained model. - token (Optional[Union[bool, str]]): The token to use for the model. - cache_folder (Optional[str]): The folder to cache the model. - revision (Optional[str], optional): The revision of the model. Defaults to None. 
- trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False. - local_files_only (bool, optional): Whether to use only local files. Defaults to False. - model_kwargs (Optional[Dict[str, Any]], optional): Additional keyword arguments for the model. Defaults to None. - tokenizer_kwargs (Optional[Dict[str, Any]], optional): Additional keyword arguments for the tokenizer. Defaults to None. - config_kwargs (Optional[Dict[str, Any]], optional): Additional keyword arguments for the config. Defaults to None. - - Returns: - List[nn.Module]: A list containing the transformer model and the pooling model. - """ - logger.warning( - f"No sentence-transformers model found with name {model_name_or_path}. Creating a new one with mean pooling." - ) - shared_kwargs = { - "token": token, - "trust_remote_code": trust_remote_code, - "revision": revision, - "local_files_only": local_files_only, - } - model_kwargs = shared_kwargs if model_kwargs is None else {**shared_kwargs, **model_kwargs} - tokenizer_kwargs = shared_kwargs if tokenizer_kwargs is None else {**shared_kwargs, **tokenizer_kwargs} - config_kwargs = shared_kwargs if config_kwargs is None else {**shared_kwargs, **config_kwargs} - - transformer_model = Transformer( - model_name_or_path, - cache_dir=cache_folder, - model_args=model_kwargs, - tokenizer_args=tokenizer_kwargs, - config_args=config_kwargs, - ) - pooling_model = Pooling(transformer_model.get_word_embedding_dimension(), "mean") - # self.model_card_data.set_base_model(model_name_or_path, revision=revision) - return [transformer_model, pooling_model] - - def _first_module(self): - """Returns the first module of this sequential embedder""" - return self._modules[next(iter(self._modules))] - - def tokenize(self, texts: Union[List[str], List[Dict], List[Tuple[str, str]]]): - """ - Tokenizes the texts - """ - return self._first_module().tokenize(texts) - - def _text_length(self, text: Union[List[int], List[List[int]]]): - """ - Help function to get the length for the input text. Text can be either - a list of ints (which means a single text as input), or a tuple of list of ints - (representing several text inputs to the model). 
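For context on the batching done by encode() further below: inputs are sorted by a simple length measure so similarly sized texts share a batch, and the original order is restored afterwards. A standalone sketch of that measure and the sort/restore pattern (sentences are illustrative):

import numpy as np

def text_length(text):
    # Mirrors _text_length: dicts use their first value, objects without __len__ count as 1,
    # empty inputs or token-id lists use len(), otherwise the member lengths are summed.
    if isinstance(text, dict):
        return len(next(iter(text.values())))
    if not hasattr(text, "__len__"):
        return 1
    if len(text) == 0 or isinstance(text[0], int):
        return len(text)
    return sum(len(t) for t in text)

sentences = ["a short one", "a noticeably longer sentence here", "tiny"]
order = np.argsort([-text_length(s) for s in sentences])     # longest first, as encode() sorts
print([sentences[i] for i in order])
# encode() later restores the original order with np.argsort(order)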
- """ - - if isinstance(text, dict): # {key: value} case - return len(next(iter(text.values()))) - elif not hasattr(text, "__len__"): # Object has no len() method - return 1 - elif len(text) == 0 or isinstance(text[0], int): # Empty string or list of ints - return len(text) - else: - return sum(len(t) for t in text) # Sum of length of individual strings - - def encode( - self, - sentences: Union[str, List[str]], - prompt_name: Optional[str] = None, - prompt: Optional[str] = None, - batch_size: int = 32, - show_progress_bar: bool = None, - output_value: Optional[Literal["sentence_embedding", "token_embeddings"]] = "sentence_embedding", - precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32", - convert_to_numpy: bool = True, - convert_to_tensor: bool = False, - normalize_embeddings: bool = False, - ) -> Union[List[Tensor], Tensor]: - self.eval() - if show_progress_bar is None: - show_progress_bar = logger.getEffectiveLevel() in (logging.INFO, logging.DEBUG) - - if convert_to_tensor: - convert_to_numpy = False - - if output_value != "sentence_embedding": - convert_to_tensor = False - convert_to_numpy = False - - input_was_string = False - if isinstance(sentences, str) or not hasattr( - sentences, "__len__" - ): # Cast an individual sentence to a list with length 1 - sentences = [sentences] - input_was_string = True - - if prompt is None: - if prompt_name is not None: - try: - prompt = self.prompts[prompt_name] - except KeyError: - raise ValueError( - f"Prompt name '{prompt_name}' not found in the configured prompts dictionary with keys {list(self.prompts.keys())!r}." - ) - elif self.default_prompt_name is not None: - prompt = self.prompts.get(self.default_prompt_name, None) - else: - if prompt_name is not None: - logger.warning( - "Encode with either a `prompt`, a `prompt_name`, or neither, but not both. " - "Ignoring the `prompt_name` in favor of `prompt`." - ) - - extra_features = {} - if prompt is not None: - sentences = [prompt + sentence for sentence in sentences] - - # Some models (e.g. 
INSTRUCTOR, GRIT) require removing the prompt before pooling - # Tracking the prompt length allow us to remove the prompt during pooling - tokenized_prompt = self.tokenize([prompt]) - if "input_ids" in tokenized_prompt: - extra_features["prompt_length"] = tokenized_prompt["input_ids"].shape[-1] - 1 - - all_embeddings = [] - length_sorted_idx = np.argsort([-self._text_length(sen) for sen in sentences]) - sentences_sorted = [sentences[idx] for idx in length_sorted_idx] - - for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar): - sentences_batch = sentences_sorted[start_index: start_index + batch_size] - features = self.tokenize(sentences_batch) - features.update(extra_features) - - with no_grad(): - out_features = self.forward(features) - - out_features["sentence_embedding"] = truncate_embeddings( - out_features["sentence_embedding"], self.truncate_dim - ) - - if output_value == "token_embeddings": - embeddings = [] - for token_emb, attention in zip(out_features[output_value], out_features["attention_mask"]): - last_mask_id = len(attention) - 1 - while last_mask_id > 0 and attention[last_mask_id].item() == 0: - last_mask_id -= 1 - - embeddings.append(token_emb[0: last_mask_id + 1]) - elif output_value is None: # Return all outputs - embeddings = [] - for sent_idx in range(len(out_features["sentence_embedding"])): - row = {name: out_features[name][sent_idx] for name in out_features} - embeddings.append(row) - else: # Sentence embeddings - embeddings = out_features[output_value] - if normalize_embeddings: - embeddings = nn.functional.normalize(embeddings, p=2, dim=1) - - all_embeddings.extend(embeddings) - - all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)] - - # if precision and precision != "float32": - # all_embeddings = quantize_embeddings(all_embeddings, precision=precision) - - if convert_to_tensor: - if len(all_embeddings): - if isinstance(all_embeddings, np.ndarray): - all_embeddings = ops.from_numpy(all_embeddings) - else: - all_embeddings = ops.stack(all_embeddings) - else: - all_embeddings = mindspore.Tensor() - elif convert_to_numpy: - if not isinstance(all_embeddings, np.ndarray): - if all_embeddings and all_embeddings[0].dtype == mindspore.bfloat16: - all_embeddings = np.asarray([emb.float().asnumpy() for emb in all_embeddings]) - else: - all_embeddings = np.asarray([emb.asnumpy() for emb in all_embeddings]) - elif isinstance(all_embeddings, np.ndarray): - all_embeddings = [ops.from_numpy(embedding) for embedding in all_embeddings] - - if input_was_string: - all_embeddings = all_embeddings[0] - - return all_embeddings - - def encode_texts(self, texts: List[str]) -> List[List[float]]: - texts = [t.replace("\n", " ") for t in texts] - embeddings = self.encode(texts) - for i, embedding in enumerate(embeddings): - embeddings[i] = embedding.tolist() - return embeddings - - def forward(self, input: dict[str, Tensor], **kwargs) -> dict[str, Tensor]: - if self.module_kwargs is None: - return super().forward(input) - - for module_name, module in self.named_children(): - module_kwarg_keys = self.module_kwargs.get(module_name, []) - module_kwargs = {key: value for key, value in kwargs.items() if key in module_kwarg_keys} - input = module(input, **module_kwargs) - return input - - @property - def similarity_fn_name(self) -> Literal["cosine", "dot", "euclidean", "manhattan"]: - """Return the name of the similarity function used by :meth:`SentenceTransformer.similarity` and 
:meth:`SentenceTransformer.similarity_pairwise`. - - Returns: - Optional[str]: The name of the similarity function. Can be None if not set, in which case it will - default to "cosine" when first called. - - Example: - >>> model = SentenceTransformer("multi-qa-mpnet-base-dot-v1") - >>> model.similarity_fn_name - 'dot' - """ - if self._similarity_fn_name is None: - self.similarity_fn_name = SimilarityFunction.COSINE - return self._similarity_fn_name - - @similarity_fn_name.setter - def similarity_fn_name( - self, value: Literal["cosine", "dot", "euclidean", "manhattan"] - ) -> None: - if isinstance(value, SimilarityFunction): - value = value.value - self._similarity_fn_name = value - - if value is not None: - self._similarity = SimilarityFunction.to_similarity_fn(value) - self._similarity_pairwise = SimilarityFunction.to_similarity_pairwise_fn(value) diff --git a/mindnlp/sentence/similarity_functions.py b/mindnlp/sentence/similarity_functions.py deleted file mode 100644 index 88e931b94..000000000 --- a/mindnlp/sentence/similarity_functions.py +++ /dev/null @@ -1,130 +0,0 @@ -"""similaritiy functions""" -from __future__ import annotations - -from enum import Enum -from typing import Callable - -from numpy import ndarray -from mindspore import Tensor - -from .util import ( - cos_sim, - dot_score, - euclidean_sim, - manhattan_sim, - pairwise_cos_sim, - pairwise_dot_score, - pairwise_euclidean_sim, - pairwise_manhattan_sim, -) - - -class SimilarityFunction(Enum): - """ - Enum class for supported similarity functions. The following functions are supported: - - - ``SimilarityFunction.COSINE`` (``"cosine"``): Cosine similarity - - ``SimilarityFunction.DOT_PRODUCT`` (``"dot"``, ``dot_product``): Dot product similarity - - ``SimilarityFunction.EUCLIDEAN`` (``"euclidean"``): Euclidean distance - - ``SimilarityFunction.MANHATTAN`` (``"manhattan"``): Manhattan distance - """ - - COSINE = "cosine" - DOT_PRODUCT = "dot" - DOT = "dot" # Alias for DOT_PRODUCT - EUCLIDEAN = "euclidean" - MANHATTAN = "manhattan" - - @staticmethod - def to_similarity_fn( - similarity_function: str | SimilarityFunction, - ) -> Callable[[Tensor | ndarray, Tensor | ndarray], Tensor]: - """ - Converts a similarity function name or enum value to the corresponding similarity function. - - Args: - similarity_function (Union[str, SimilarityFunction]): The name or enum value of the similarity function. - - Returns: - Callable[[Union[Tensor, ndarray], Union[Tensor, ndarray]], Tensor]: The corresponding similarity function. - - Raises: - ValueError: If the provided function is not supported. - - Example: - >>> similarity_fn = SimilarityFunction.to_similarity_fn("cosine") - >>> similarity_scores = similarity_fn(embeddings1, embeddings2) - >>> similarity_scores - tensor([[0.3952, 0.0554], - [0.0992, 0.1570]]) - """ - similarity_function = SimilarityFunction(similarity_function) - - if similarity_function == SimilarityFunction.COSINE: - return cos_sim - if similarity_function == SimilarityFunction.DOT_PRODUCT: - return dot_score - if similarity_function == SimilarityFunction.MANHATTAN: - return manhattan_sim - if similarity_function == SimilarityFunction.EUCLIDEAN: - return euclidean_sim - - raise ValueError( - f"The provided function {similarity_function} is not supported. Use one of the supported values: {SimilarityFunction.possible_values()}." 
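The if-chain above maps a similarity name to a callable; an equivalent dictionary-based dispatch is sketched below, with tiny NumPy stand-ins for the util functions:

import numpy as np

def cos_sim(a, b):
    a = a / np.linalg.norm(a, axis=-1, keepdims=True)
    b = b / np.linalg.norm(b, axis=-1, keepdims=True)
    return a @ b.T

def dot_score(a, b):
    return a @ b.T

_SIMILARITY_FNS = {"cosine": cos_sim, "dot": dot_score}   # euclidean/manhattan omitted for brevity

def to_similarity_fn(name):
    try:
        return _SIMILARITY_FNS[name]
    except KeyError:
        raise ValueError(f"Unsupported similarity function {name!r}; use one of {sorted(_SIMILARITY_FNS)}")

a = np.array([[1.0, 0.0], [0.0, 1.0]])
b = np.array([[1.0, 1.0]])
print(to_similarity_fn("cosine")(a, b))   # both rows score ~0.7071 against [1, 1]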
- ) - - @staticmethod - def to_similarity_pairwise_fn( - similarity_function: str | SimilarityFunction, - ) -> Callable[[Tensor | ndarray, Tensor | ndarray], Tensor]: - """ - Converts a similarity function into a pairwise similarity function. - - The pairwise similarity function returns the diagonal vector from the similarity matrix, i.e. it only - computes the similarity(a[i], b[i]) for each i in the range of the input tensors, rather than - computing the similarity between all pairs of a and b. - - Args: - similarity_function (Union[str, SimilarityFunction]): The name or enum value of the similarity function. - - Returns: - Callable[[Union[Tensor, ndarray], Union[Tensor, ndarray]], Tensor]: The pairwise similarity function. - - Raises: - ValueError: If the provided similarity function is not supported. - - Example: - >>> pairwise_fn = SimilarityFunction.to_similarity_pairwise_fn("cosine") - >>> similarity_scores = pairwise_fn(embeddings1, embeddings2) - >>> similarity_scores - tensor([0.3952, 0.1570]) - """ - similarity_function = SimilarityFunction(similarity_function) - - if similarity_function == SimilarityFunction.COSINE: - return pairwise_cos_sim - if similarity_function == SimilarityFunction.DOT_PRODUCT: - return pairwise_dot_score - if similarity_function == SimilarityFunction.MANHATTAN: - return pairwise_manhattan_sim - if similarity_function == SimilarityFunction.EUCLIDEAN: - return pairwise_euclidean_sim - - raise ValueError( - f"The provided function {similarity_function} is not supported. Use one of the supported values: {SimilarityFunction.possible_values()}." - ) - - @staticmethod - def possible_values() -> list[str]: - """ - Returns a list of possible values for the SimilarityFunction enum. - - Returns: - list: A list of possible values for the SimilarityFunction enum. - - Example: - >>> possible_values = SimilarityFunction.possible_values() - >>> possible_values - ['cosine', 'dot', 'euclidean', 'manhattan'] - """ - return [m.value for m in SimilarityFunction] diff --git a/mindnlp/sentence/util.py b/mindnlp/sentence/util.py deleted file mode 100644 index 9897c61fb..000000000 --- a/mindnlp/sentence/util.py +++ /dev/null @@ -1,896 +0,0 @@ -"""sentence util""" -from __future__ import annotations - -import functools -import heapq -import importlib -import logging -import os -import queue -from contextlib import contextmanager -from importlib.metadata import PackageNotFoundError, metadata -from typing import Callable, overload - -import numpy as np -from tqdm.autonotebook import tqdm -from huggingface_hub import hf_hub_download, snapshot_download - -import mindspore -from mindspore import Tensor -from mindnlp.core import ops, nn - -logger = logging.getLogger(__name__) - - -def _convert_to_tensor(a: list | np.ndarray | Tensor) -> Tensor: - """ - Converts the input `a` to a PyTorch tensor if it is not already a tensor. - - Args: - a (Union[list, np.ndarray, Tensor]): The input array or tensor. - - Returns: - Tensor: The converted tensor. - """ - if not isinstance(a, Tensor): - a = mindspore.tensor(a) - return a - - -def _convert_to_batch(a: Tensor) -> Tensor: - """ - If the tensor `a` is 1-dimensional, it is unsqueezed to add a batch dimension. - - Args: - a (Tensor): The input tensor. - - Returns: - Tensor: The tensor with a batch dimension. - """ - if a.dim() == 1: - a = a.unsqueeze(0) - return a - - -def _convert_to_batch_tensor(a: list | np.ndarray | Tensor) -> Tensor: - """ - Converts the input data to a tensor with a batch dimension. 
- - Args: - a (Union[list, np.ndarray, Tensor]): The input data to be converted. - - Returns: - Tensor: The converted tensor with a batch dimension. - """ - a = _convert_to_tensor(a) - a = _convert_to_batch(a) - return a - - -def pytorch_cos_sim(a: Tensor, b: Tensor) -> Tensor: - """ - Computes the cosine similarity between two tensors. - - Args: - a (Union[list, np.ndarray, Tensor]): The first tensor. - b (Union[list, np.ndarray, Tensor]): The second tensor. - - Returns: - Tensor: Matrix with res[i][j] = cos_sim(a[i], b[j]) - """ - return cos_sim(a, b) - - -def cos_sim(a: list | np.ndarray | Tensor, b: list | np.ndarray | Tensor) -> Tensor: - """ - Computes the cosine similarity between two tensors. - - Args: - a (Union[list, np.ndarray, Tensor]): The first tensor. - b (Union[list, np.ndarray, Tensor]): The second tensor. - - Returns: - Tensor: Matrix with res[i][j] = cos_sim(a[i], b[j]) - """ - a = _convert_to_batch_tensor(a) - b = _convert_to_batch_tensor(b) - - a_norm = normalize_embeddings(a) - b_norm = normalize_embeddings(b) - return ops.mm(a_norm, b_norm.transpose(0, 1)) - - -def pairwise_cos_sim(a: Tensor, b: Tensor) -> Tensor: - """ - Computes the pairwise cosine similarity cos_sim(a[i], b[i]). - - Args: - a (Union[list, np.ndarray, Tensor]): The first tensor. - b (Union[list, np.ndarray, Tensor]): The second tensor. - - Returns: - Tensor: Vector with res[i] = cos_sim(a[i], b[i]) - """ - a = _convert_to_tensor(a) - b = _convert_to_tensor(b) - - return pairwise_dot_score(normalize_embeddings(a), normalize_embeddings(b)) - - -def dot_score(a: list | np.ndarray | Tensor, b: list | np.ndarray | Tensor) -> Tensor: - """ - Computes the dot-product dot_prod(a[i], b[j]) for all i and j. - - Args: - a (Union[list, np.ndarray, Tensor]): The first tensor. - b (Union[list, np.ndarray, Tensor]): The second tensor. - - Returns: - Tensor: Matrix with res[i][j] = dot_prod(a[i], b[j]) - """ - a = _convert_to_batch_tensor(a) - b = _convert_to_batch_tensor(b) - - return ops.mm(a, b.transpose(0, 1)) - - -def pairwise_dot_score(a: Tensor, b: Tensor) -> Tensor: - """ - Computes the pairwise dot-product dot_prod(a[i], b[i]). - - Args: - a (Union[list, np.ndarray, Tensor]): The first tensor. - b (Union[list, np.ndarray, Tensor]): The second tensor. - - Returns: - Tensor: Vector with res[i] = dot_prod(a[i], b[i]) - """ - a = _convert_to_tensor(a) - b = _convert_to_tensor(b) - - return (a * b).sum(dim=-1) - - -def manhattan_sim(a: list | np.ndarray | Tensor, b: list | np.ndarray | Tensor) -> Tensor: - """ - Computes the manhattan similarity (i.e., negative distance) between two tensors. - - Args: - a (Union[list, np.ndarray, Tensor]): The first tensor. - b (Union[list, np.ndarray, Tensor]): The second tensor. - - Returns: - Tensor: Matrix with res[i][j] = -manhattan_distance(a[i], b[j]) - """ - a = _convert_to_batch_tensor(a) - b = _convert_to_batch_tensor(b) - - return -ops.cdist(a, b, p=1.0) - - -def pairwise_manhattan_sim(a: list | np.ndarray | Tensor, b: list | np.ndarray | Tensor): - """ - Computes the manhattan similarity (i.e., negative distance) between pairs of tensors. - - Args: - a (Union[list, np.ndarray, Tensor]): The first tensor. - b (Union[list, np.ndarray, Tensor]): The second tensor. 
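cos_sim above returns the full similarity matrix (row-normalize, then matmul), while the pairwise_* variants only score aligned rows. A NumPy sketch contrasting the two output shapes on toy vectors:

import numpy as np

def normalize_rows(x):
    return x / np.linalg.norm(x, axis=1, keepdims=True)

a = np.array([[1.0, 0.0], [1.0, 1.0]])
b = np.array([[1.0, 0.0], [0.0, 1.0]])

matrix = normalize_rows(a) @ normalize_rows(b).T            # (2, 2): every a[i] against every b[j]
pairwise = (normalize_rows(a) * normalize_rows(b)).sum(1)   # (2,):   a[i] against b[i] only
print(matrix.shape, pairwise.shape)                         # (2, 2) (2,)
print(np.allclose(np.diag(matrix), pairwise))               # True: pairwise is the diagonal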
- - Returns: - Tensor: Vector with res[i] = -manhattan_distance(a[i], b[i]) - """ - a = _convert_to_tensor(a) - b = _convert_to_tensor(b) - - return -ops.sum(ops.abs(a - b), dim=-1) - - -def euclidean_sim(a: list | np.ndarray | Tensor, b: list | np.ndarray | Tensor) -> Tensor: - """ - Computes the euclidean similarity (i.e., negative distance) between two tensors. - - Args: - a (Union[list, np.ndarray, Tensor]): The first tensor. - b (Union[list, np.ndarray, Tensor]): The second tensor. - - Returns: - Tensor: Matrix with res[i][j] = -euclidean_distance(a[i], b[j]) - """ - a = _convert_to_batch_tensor(a) - b = _convert_to_batch_tensor(b) - - return -ops.cdist(a, b, p=2.0) - - -def pairwise_euclidean_sim(a: list | np.ndarray | Tensor, b: list | np.ndarray | Tensor): - """ - Computes the euclidean distance (i.e., negative distance) between pairs of tensors. - - Args: - a (Union[list, np.ndarray, Tensor]): The first tensor. - b (Union[list, np.ndarray, Tensor]): The second tensor. - - Returns: - Tensor: Vector with res[i] = -euclidean_distance(a[i], b[i]) - """ - a = _convert_to_tensor(a) - b = _convert_to_tensor(b) - - return -ops.sqrt(ops.sum((a - b) ** 2, dim=-1)) - - -def pairwise_angle_sim(x: Tensor, y: Tensor) -> Tensor: - """ - Computes the absolute normalized angle distance. See :class:`~sentence_transformers.losses.AnglELoss` - or https://arxiv.org/abs/2309.12871v1 for more information. - - Args: - x (Tensor): The first tensor. - y (Tensor): The second tensor. - - Returns: - Tensor: Vector with res[i] = angle_sim(a[i], b[i]) - """ - - x = _convert_to_tensor(x) - y = _convert_to_tensor(y) - - # modified from https://github.com/SeanLee97/AnglE/blob/main/angle_emb/angle.py - # chunk both tensors to obtain complex components - a, b = ops.chunk(x, 2, dim=1) - c, d = ops.chunk(y, 2, dim=1) - - z = ops.sum(c**2 + d**2, dim=1, keepdim=True) - re = (a * c + b * d) / z - im = (b * c - a * d) / z - - dz = ops.sum(a**2 + b**2, dim=1, keepdim=True) ** 0.5 - dw = ops.sum(c**2 + d**2, dim=1, keepdim=True) ** 0.5 - re /= dz / dw - im /= dz / dw - - norm_angle = ops.sum(ops.concat((re, im), dim=1), dim=1) - return ops.abs(norm_angle) - - -def normalize_embeddings(embeddings: Tensor) -> Tensor: - """ - Normalizes the embeddings matrix, so that each sentence embedding has unit length. - - Args: - embeddings (Tensor): The input embeddings matrix. - - Returns: - Tensor: The normalized embeddings matrix. - """ - return nn.functional.normalize(embeddings, p=2, dim=1) - - -@overload -def truncate_embeddings(embeddings: np.ndarray, truncate_dim: int | None) -> np.ndarray: ... - - -@overload -def truncate_embeddings(embeddings: mindspore.Tensor, truncate_dim: int | None) -> mindspore.Tensor: ... - - -def truncate_embeddings(embeddings: np.ndarray | mindspore.Tensor, truncate_dim: int | None) -> np.ndarray | mindspore.Tensor: - """ - Truncates the embeddings matrix. - - Args: - embeddings (Union[np.ndarray, mindspore.Tensor]): Embeddings to truncate. - truncate_dim (Optional[int]): The dimension to truncate sentence embeddings to. `None` does no truncation. 
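pairwise_angle_sim above treats each embedding as a batch of complex numbers, with the first half of the dimensions as real parts and the second half as imaginary parts. A NumPy transcription of that arithmetic for a toy pair, useful for checking the formula outside MindSpore (values are illustrative):

import numpy as np

def pairwise_angle_sim_np(x, y):
    a, b = np.split(x, 2, axis=1)    # real / imaginary halves of x
    c, d = np.split(y, 2, axis=1)    # real / imaginary halves of y
    z = (c ** 2 + d ** 2).sum(axis=1, keepdims=True)
    re = (a * c + b * d) / z
    im = (b * c - a * d) / z
    dz = ((a ** 2 + b ** 2).sum(axis=1, keepdims=True)) ** 0.5
    dw = ((c ** 2 + d ** 2).sum(axis=1, keepdims=True)) ** 0.5
    re /= dz / dw
    im /= dz / dw
    return np.abs(np.concatenate((re, im), axis=1).sum(axis=1))

x = np.array([[1.0, 0.0, 0.0, 1.0]])
print(pairwise_angle_sim_np(x, x))   # [1.] for this identical pair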
- - Example: - >>> from sentence_transformers import SentenceTransformer - >>> from sentence_transformers.util import truncate_embeddings - >>> model = SentenceTransformer("tomaarsen/mpnet-base-nli-matryoshka") - >>> embeddings = model.encode(["It's so nice outside!", "Today is a beautiful day.", "He drove to work earlier"]) - >>> embeddings.shape - (3, 768) - >>> model.similarity(embeddings, embeddings) - tensor([[1.0000, 0.8100, 0.1426], - [0.8100, 1.0000, 0.2121], - [0.1426, 0.2121, 1.0000]]) - >>> truncated_embeddings = truncate_embeddings(embeddings, 128) - >>> truncated_embeddings.shape - >>> model.similarity(truncated_embeddings, truncated_embeddings) - tensor([[1.0000, 0.8092, 0.1987], - [0.8092, 1.0000, 0.2716], - [0.1987, 0.2716, 1.0000]]) - - Returns: - Union[np.ndarray, mindspore.Tensor]: Truncated embeddings. - """ - return embeddings[..., :truncate_dim] - - -def paraphrase_mining( - model, - sentences: list[str], - show_progress_bar: bool = False, - batch_size: int = 32, - query_chunk_size: int = 5000, - corpus_chunk_size: int = 100000, - max_pairs: int = 500000, - top_k: int = 100, - score_function: Callable[[Tensor, Tensor], Tensor] = cos_sim, -) -> list[list[float | int]]: - """ - Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all - other sentences and returns a list with the pairs that have the highest cosine similarity score. - - Args: - model (SentenceTransformer): SentenceTransformer model for embedding computation - sentences (List[str]): A list of strings (texts or sentences) - show_progress_bar (bool, optional): Plotting of a progress bar. Defaults to False. - batch_size (int, optional): Number of texts that are encoded simultaneously by the model. Defaults to 32. - query_chunk_size (int, optional): Search for most similar pairs for #query_chunk_size at the same time. Decrease, to lower memory footprint (increases run-time). Defaults to 5000. - corpus_chunk_size (int, optional): Compare a sentence simultaneously against #corpus_chunk_size other sentences. Decrease, to lower memory footprint (increases run-time). Defaults to 100000. - max_pairs (int, optional): Maximal number of text pairs returned. Defaults to 500000. - top_k (int, optional): For each sentence, we retrieve up to top_k other sentences. Defaults to 100. - score_function (Callable[[Tensor, Tensor], Tensor], optional): Function for computing scores. By default, cosine similarity. Defaults to cos_sim. - - Returns: - List[List[Union[float, int]]]: Returns a list of triplets with the format [score, id1, id2] - """ - - # Compute embedding for the sentences - embeddings = model.encode( - sentences, show_progress_bar=show_progress_bar, batch_size=batch_size, convert_to_tensor=True - ) - - return paraphrase_mining_embeddings( - embeddings, - query_chunk_size=query_chunk_size, - corpus_chunk_size=corpus_chunk_size, - max_pairs=max_pairs, - top_k=top_k, - score_function=score_function, - ) - - -def paraphrase_mining_embeddings( - embeddings: Tensor, - query_chunk_size: int = 5000, - corpus_chunk_size: int = 100000, - max_pairs: int = 500000, - top_k: int = 100, - score_function: Callable[[Tensor, Tensor], Tensor] = cos_sim, -) -> list[list[float | int]]: - """ - Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all - other sentences and returns a list with the pairs that have the highest cosine similarity score. 
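Conceptually, paraphrase mining scores every distinct sentence pair and keeps the highest-scoring ones; the implementation that follows chunks the work and bounds memory with a priority queue. A brute-force NumPy sketch of the output format:

import numpy as np

def mine_pairs(embeddings, top_n=3):
    x = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    scores = x @ x.T
    pairs = [(scores[i, j], i, j)
             for i in range(len(x)) for j in range(i + 1, len(x))]
    pairs.sort(reverse=True)                          # highest cosine similarity first
    return [[float(s), i, j] for s, i, j in pairs[:top_n]]

emb = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]])
print(mine_pairs(emb))   # triplets in the same [score, id1, id2] format, best pair first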
- - Args: - embeddings (Tensor): A tensor with the embeddings - query_chunk_size (int): Search for most similar pairs for #query_chunk_size at the same time. Decrease, to lower memory footprint (increases run-time). - corpus_chunk_size (int): Compare a sentence simultaneously against #corpus_chunk_size other sentences. Decrease, to lower memory footprint (increases run-time). - max_pairs (int): Maximal number of text pairs returned. - top_k (int): For each sentence, we retrieve up to top_k other sentences - score_function (Callable[[Tensor, Tensor], Tensor]): Function for computing scores. By default, cosine similarity. - - Returns: - List[List[Union[float, int]]]: Returns a list of triplets with the format [score, id1, id2] - """ - - top_k += 1 # A sentence has the highest similarity to itself. Increase +1 as we are interest in distinct pairs - - # Mine for duplicates - pairs = queue.PriorityQueue() - min_score = -1 - num_added = 0 - - for corpus_start_idx in range(0, len(embeddings), corpus_chunk_size): - for query_start_idx in range(0, len(embeddings), query_chunk_size): - scores = score_function( - embeddings[query_start_idx : query_start_idx + query_chunk_size], - embeddings[corpus_start_idx : corpus_start_idx + corpus_chunk_size], - ) - - scores_top_k_values, scores_top_k_idx = ops.topk( - scores, min(top_k, len(scores[0])), dim=1, largest=True, sorted=False - ) - scores_top_k_values = scores_top_k_values.cpu().tolist() - scores_top_k_idx = scores_top_k_idx.cpu().tolist() - - for query_itr in range(len(scores)): - for top_k_idx, corpus_itr in enumerate(scores_top_k_idx[query_itr]): - i = query_start_idx + query_itr - j = corpus_start_idx + corpus_itr - - if i != j and scores_top_k_values[query_itr][top_k_idx] > min_score: - pairs.put((scores_top_k_values[query_itr][top_k_idx], i, j)) - num_added += 1 - - if num_added >= max_pairs: - entry = pairs.get() - min_score = entry[0] - - # Get the pairs - added_pairs = set() # Used for duplicate detection - pairs_list = [] - while not pairs.empty(): - score, i, j = pairs.get() - sorted_i, sorted_j = sorted([i, j]) - - if sorted_i != sorted_j and (sorted_i, sorted_j) not in added_pairs: - added_pairs.add((sorted_i, sorted_j)) - pairs_list.append([score, sorted_i, sorted_j]) - - # Highest scores first - pairs_list = sorted(pairs_list, key=lambda x: x[0], reverse=True) - return pairs_list - - -def information_retrieval(*args, **kwargs) -> list[list[dict[str, int | float]]]: - """This function is deprecated. Use semantic_search instead""" - return semantic_search(*args, **kwargs) - - -def semantic_search( - query_embeddings: Tensor, - corpus_embeddings: Tensor, - query_chunk_size: int = 100, - corpus_chunk_size: int = 500000, - top_k: int = 10, - score_function: Callable[[Tensor, Tensor], Tensor] = cos_sim, -) -> list[list[dict[str, int | float]]]: - """ - This function performs a cosine similarity search between a list of query embeddings and a list of corpus embeddings. - It can be used for Information Retrieval / Semantic Search for corpora up to about 1 Million entries. - - Args: - query_embeddings (Tensor): A 2 dimensional tensor with the query embeddings. - corpus_embeddings (Tensor): A 2 dimensional tensor with the corpus embeddings. - query_chunk_size (int, optional): Process 100 queries simultaneously. Increasing that value increases the speed, but requires more memory. Defaults to 100. - corpus_chunk_size (int, optional): Scans the corpus 100k entries at a time. Increasing that value increases the speed, but requires more memory. 
Defaults to 500000. - top_k (int, optional): Retrieve top k matching entries. Defaults to 10. - score_function (Callable[[Tensor, Tensor], Tensor], optional): Function for computing scores. By default, cosine similarity. - - Returns: - List[List[Dict[str, Union[int, float]]]]: A list with one entry for each query. Each entry is a list of dictionaries with the keys 'corpus_id' and 'score', sorted by decreasing cosine similarity scores. - """ - - if isinstance(query_embeddings, (np.ndarray, np.generic)): - query_embeddings = ops.from_numpy(query_embeddings) - elif isinstance(query_embeddings, list): - query_embeddings = ops.stack(query_embeddings) - - if len(query_embeddings.shape) == 1: - query_embeddings = query_embeddings.unsqueeze(0) - - if isinstance(corpus_embeddings, (np.ndarray, np.generic)): - corpus_embeddings = ops.from_numpy(corpus_embeddings) - elif isinstance(corpus_embeddings, list): - corpus_embeddings = ops.stack(corpus_embeddings) - - queries_result_list = [[] for _ in range(len(query_embeddings))] - - for query_start_idx in range(0, len(query_embeddings), query_chunk_size): - # Iterate over chunks of the corpus - for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size): - # Compute cosine similarities - cos_scores = score_function( - query_embeddings[query_start_idx : query_start_idx + query_chunk_size], - corpus_embeddings[corpus_start_idx : corpus_start_idx + corpus_chunk_size], - ) - - # Get top-k scores - cos_scores_top_k_values, cos_scores_top_k_idx = ops.topk( - cos_scores, min(top_k, len(cos_scores[0])), dim=1, largest=True, sorted=False - ) - cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist() - cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist() - - for query_itr in range(len(cos_scores)): - for sub_corpus_id, score in zip(cos_scores_top_k_idx[query_itr], cos_scores_top_k_values[query_itr]): - corpus_id = corpus_start_idx + sub_corpus_id - query_id = query_start_idx + query_itr - if len(queries_result_list[query_id]) < top_k: - heapq.heappush( - queries_result_list[query_id], (score, corpus_id) - ) # heaqp tracks the quantity of the first element in the tuple - else: - heapq.heappushpop(queries_result_list[query_id], (score, corpus_id)) - - # change the data format and sort - for query_id in range(len(queries_result_list)): - for doc_itr in range(len(queries_result_list[query_id])): - score, corpus_id = queries_result_list[query_id][doc_itr] - queries_result_list[query_id][doc_itr] = {"corpus_id": corpus_id, "score": score} - queries_result_list[query_id] = sorted(queries_result_list[query_id], key=lambda x: x["score"], reverse=True) - - return queries_result_list - - -def fullname(o) -> str: - """ - Gives a full name (package_name.class_name) for a class / object in Python. Will - be used to load the correct classes from JSON files - - Args: - o: The object for which to get the full name. - - Returns: - str: The full name of the object. 
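Editor's note: both `paraphrase_mining_embeddings` and `semantic_search` above follow the same pattern: score one chunk at a time, keep only the best candidates in a small heap, and raise the score floor once the budget is full. A single-chunk simplification in plain NumPy and `heapq` (illustrative names, not the deleted implementation):

```python
import heapq
import numpy as np

def mine_top_pairs(emb: np.ndarray, max_pairs: int = 10, top_k: int = 3):
    """Keep the globally best-scoring (i, j) pairs in a bounded min-heap."""
    sims = emb @ emb.T / (
        np.linalg.norm(emb, axis=1, keepdims=True) * np.linalg.norm(emb, axis=1)
    )
    heap, min_score = [], -1.0
    for i in range(len(emb)):
        # top_k + 1 because each row's best match is itself
        best_j = np.argsort(-sims[i])[: top_k + 1]
        for j in best_j:
            if i != j and sims[i, j] > min_score:
                heapq.heappush(heap, (sims[i, j], i, j))
                if len(heap) > max_pairs:
                    min_score = heapq.heappop(heap)[0]   # raise the score floor
    # deduplicate (i, j) / (j, i) and return best-first
    seen, pairs = set(), []
    for score, i, j in heap:
        key = tuple(sorted((i, j)))
        if key not in seen:
            seen.add(key)
            pairs.append([score, *key])
    return sorted(pairs, reverse=True)

print(mine_top_pairs(np.random.rand(20, 16)))
```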
- - Example: - >>> from sentence_transformers.losses import MultipleNegativesRankingLoss - >>> from sentence_transformers import SentenceTransformer - >>> from sentence_transformers.util import fullname - >>> model = SentenceTransformer('all-MiniLM-L6-v2') - >>> loss = MultipleNegativesRankingLoss(model) - >>> fullname(loss) - 'sentence_transformers.losses.MultipleNegativesRankingLoss.MultipleNegativesRankingLoss' - """ - - module = o.__class__.__module__ - if module is None or module == str.__class__.__module__: - return o.__class__.__name__ # Avoid reporting __builtin__ - else: - return module + "." + o.__class__.__name__ - - -def import_from_string(dotted_path: str) -> type: - """ - Import a dotted module path and return the attribute/class designated by the - last name in the path. Raise ImportError if the import failed. - - Args: - dotted_path (str): The dotted module path. - - Returns: - Any: The attribute/class designated by the last name in the path. - - Raises: - ImportError: If the import failed. - - Example: - >>> import_from_string('sentence_transformers.losses.MultipleNegativesRankingLoss') - - """ - if 'sentence_transformers' in dotted_path: - dotted_path = dotted_path.replace('sentence_transformers', 'mindnlp.sentence') - - if 'torch.nn' in dotted_path: - dotted_path = dotted_path.replace('torch.nn', 'mindnlp.core.nn') - - try: - module_path, class_name = dotted_path.rsplit(".", 1) - except ValueError: - msg = f"{dotted_path} doesn't look like a module path" - raise ImportError(msg) - - try: - module = importlib.import_module(dotted_path) - except Exception: - module = importlib.import_module(module_path) - - try: - return getattr(module, class_name) - except AttributeError: - msg = f'Module "{module_path}" does not define a "{class_name}" attribute/class' - raise ImportError(msg) - - -def community_detection( - embeddings: mindspore.Tensor | np.ndarray, - threshold: float = 0.75, - min_community_size: int = 10, - batch_size: int = 1024, - show_progress_bar: bool = False, -) -> list[list[int]]: - """ - Function for Fast Community Detection. - - Finds in the embeddings all communities, i.e. embeddings that are close (closer than threshold). - Returns only communities that are larger than min_community_size. The communities are returned - in decreasing order. The first element in each list is the central point in the community. - - Args: - embeddings (mindspore.Tensor or numpy.ndarray): The input embeddings. - threshold (float): The threshold for determining if two embeddings are close. Defaults to 0.75. - min_community_size (int): The minimum size of a community to be considered. Defaults to 10. - batch_size (int): The batch size for computing cosine similarity scores. Defaults to 1024. - show_progress_bar (bool): Whether to show a progress bar during computation. Defaults to False. - - Returns: - List[List[int]]: A list of communities, where each community is represented as a list of indices. 
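Editor's note: a compact NumPy sketch of the two-step community detection described above, purely for illustration (simplified parameter names, no batching or progress bar): first collect every candidate community whose centre has enough close neighbours, then greedily keep the largest non-overlapping ones.

```python
import numpy as np

def community_detection(emb: np.ndarray, threshold: float = 0.75, min_size: int = 3):
    emb = emb / np.linalg.norm(emb, axis=1, keepdims=True)
    sims = emb @ emb.T
    communities = []
    for i in range(len(emb)):
        members = np.where(sims[i] >= threshold)[0]
        if len(members) >= min_size:
            # centre first, then the rest ordered by similarity to the centre
            order = members[np.argsort(-sims[i][members])]
            communities.append(order.tolist())
    # Step 2: greedily keep the largest communities that do not overlap
    communities.sort(key=len, reverse=True)
    taken, unique = set(), []
    for community in communities:
        community = [idx for idx in community if idx not in taken]
        if len(community) >= min_size:
            unique.append(community)
            taken.update(community)
    return unique

print(community_detection(np.random.rand(50, 8), threshold=0.85, min_size=3))
```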
- """ - if not isinstance(embeddings, mindspore.Tensor): - embeddings = mindspore.tensor(embeddings) - - threshold = mindspore.tensor(threshold) - embeddings = normalize_embeddings(embeddings) - - extracted_communities = [] - - # Maximum size for community - min_community_size = min(min_community_size, len(embeddings)) - sort_max_size = min(max(2 * min_community_size, 50), len(embeddings)) - - for start_idx in tqdm( - range(0, len(embeddings), batch_size), desc="Finding clusters", disable=not show_progress_bar - ): - # Compute cosine similarity scores - cos_scores = embeddings[start_idx : start_idx + batch_size] @ embeddings.T - - # Threshold the cos scores and determine how many close embeddings exist per embedding - threshold_mask = cos_scores >= threshold - row_wise_count = threshold_mask.sum(1) - - # Only consider embeddings with enough close other embeddings - large_enough_mask = row_wise_count >= min_community_size - if not large_enough_mask.any(): - continue - - row_wise_count = row_wise_count[large_enough_mask] - cos_scores = cos_scores[large_enough_mask] - - # The max is the largest potential community, so we use that in topk - k = row_wise_count.max() - _, top_k_indices = cos_scores.topk(k=k, largest=True) - - # Use the row-wise count to slice the indices - for count, indices in zip(row_wise_count, top_k_indices): - extracted_communities.append(indices[:count].tolist()) - - # Largest cluster first - extracted_communities = sorted(extracted_communities, key=lambda x: len(x), reverse=True) - - # Step 2) Remove overlapping communities - unique_communities = [] - extracted_ids = set() - - for cluster_id, community in enumerate(extracted_communities): - non_overlapped_community = [] - for idx in community: - if idx not in extracted_ids: - non_overlapped_community.append(idx) - - if len(non_overlapped_community) >= min_community_size: - unique_communities.append(non_overlapped_community) - extracted_ids.update(non_overlapped_community) - - unique_communities = sorted(unique_communities, key=lambda x: len(x), reverse=True) - - return unique_communities - - -################## -# -###################### - - -class disabled_tqdm(tqdm): - """ - Class to override `disable` argument in case progress bars are globally disabled. - - Taken from https://github.com/tqdm/tqdm/issues/619#issuecomment-619639324. - """ - - def __init__(self, *args, **kwargs): - kwargs["disable"] = True - super().__init__(*args, **kwargs) - - def __delattr__(self, attr: str) -> None: - """Fix for https://github.com/huggingface/huggingface_hub/issues/1603""" - try: - super().__delattr__(attr) - except AttributeError: - if attr != "_lock": - raise - - -@contextmanager -def disable_logging(highest_level=logging.CRITICAL): - """ - A context manager that will prevent any logging messages - triggered during the body from being processed. - - Args: - highest_level: the maximum logging level allowed. - """ - - previous_level = logging.root.manager.disable - - logging.disable(highest_level) - - try: - yield - finally: - logging.disable(previous_level) - - -def is_sentence_transformer_model( - model_name_or_path: str, - token: bool | str | None = None, - cache_folder: str | None = None, - revision: str | None = None, - local_files_only: bool = False, -) -> bool: - """ - Checks if the given model name or path corresponds to a SentenceTransformer model. - - Args: - model_name_or_path (str): The name or path of the model. - token (Optional[Union[bool, str]]): The token to be used for authentication. Defaults to None. 
- cache_folder (Optional[str]): The folder to cache the model files. Defaults to None. - revision (Optional[str]): The revision of the model. Defaults to None. - local_files_only (bool): Whether to only use local files for the model. Defaults to False. - - Returns: - bool: True if the model is a SentenceTransformer model, False otherwise. - """ - return bool( - load_file_path( - model_name_or_path, - "modules.json", - token=token, - cache_folder=cache_folder, - revision=revision, - local_files_only=local_files_only, - ) - ) - - -def load_file_path( - model_name_or_path: str, - filename: str, - token: bool | str | None = None, - cache_folder: str | None = None, - revision: str | None = None, - local_files_only: bool = False, -) -> str | None: - """ - Loads a file from a local or remote location. - - Args: - model_name_or_path (str): The model name or path. - filename (str): The name of the file to load. - token (Optional[Union[bool, str]]): The token to access the remote file (if applicable). - cache_folder (Optional[str]): The folder to cache the downloaded file (if applicable). - revision (Optional[str], optional): The revision of the file (if applicable). Defaults to None. - local_files_only (bool, optional): Whether to only consider local files. Defaults to False. - - Returns: - Optional[str]: The path to the loaded file, or None if the file could not be found or loaded. - """ - # If file is local - file_path = os.path.join(model_name_or_path, filename) - if os.path.exists(file_path): - return file_path - - # If file is remote - try: - return hf_hub_download( - model_name_or_path, - filename=filename, - revision=revision, - library_name="sentence-transformers", - token=token, - cache_dir=cache_folder, - local_files_only=local_files_only, - ) - except Exception: - return None - - -def load_dir_path( - model_name_or_path: str, - directory: str, - token: bool | str | None = None, - cache_folder: str | None = None, - revision: str | None = None, - local_files_only: bool = False, -) -> str | None: - """ - Loads the directory path for a given model name or path. - - Args: - model_name_or_path (str): The name or path of the model. - directory (str): The directory to load. - token (Optional[Union[bool, str]]): The token for authentication. - cache_folder (Optional[str]): The folder to cache the downloaded files. - revision (Optional[str], optional): The revision of the model. Defaults to None. - local_files_only (bool, optional): Whether to only use local files. Defaults to False. - - Returns: - Optional[str]: The directory path if it exists, otherwise None. - """ - # If file is local - dir_path = os.path.join(model_name_or_path, directory) - if os.path.exists(dir_path): - return dir_path - - download_kwargs = { - "repo_id": model_name_or_path, - "revision": revision, - "allow_patterns": f"{directory}/**", - "library_name": "sentence-transformers", - "token": token, - "cache_dir": cache_folder, - "local_files_only": local_files_only, - "tqdm_class": disabled_tqdm, - } - # Try to download from the remote - try: - repo_path = snapshot_download(**download_kwargs) - except Exception: - # Otherwise, try local (i.e. 
cache) only - download_kwargs["local_files_only"] = True - repo_path = snapshot_download(**download_kwargs) - return os.path.join(repo_path, directory) - - -def save_to_hub_args_decorator(func): - @functools.wraps(func) - def wrapper(self, *args, **kwargs): - # If repo_id not already set, use repo_name - repo_name = kwargs.pop("repo_name", None) - if repo_name and "repo_id" not in kwargs: - logger.warning( - "Providing a `repo_name` keyword argument to `save_to_hub` is deprecated, please use `repo_id` instead." - ) - kwargs["repo_id"] = repo_name - - # If positional args are used, adjust for the new "token" keyword argument - if len(args) >= 2: - args = (*args[:2], None, *args[2:]) - - return func(self, *args, **kwargs) - - return wrapper - - -def check_package_availability(package_name: str, owner: str) -> bool: - """ - Checks if a package is available from the correct owner. - """ - try: - meta = metadata(package_name) - return meta["Name"] == package_name and owner in meta["Home-page"] - except PackageNotFoundError: - return False - - -def is_accelerate_available() -> bool: - """ - Returns True if the Huggingface accelerate library is available. - """ - return check_package_availability("accelerate", "huggingface") - - -def is_datasets_available() -> bool: - """ - Returns True if the Huggingface datasets library is available. - """ - return check_package_availability("datasets", "huggingface") - - -def is_training_available() -> bool: - """ - Returns True if we have the required dependencies for training Sentence - Transformers models, i.e. Huggingface datasets and Huggingface accelerate. - """ - return is_accelerate_available() and is_datasets_available() - - -@contextmanager -def disable_datasets_caching(): - """ - A context manager that will disable caching in the datasets library. - """ - from datasets import disable_caching, enable_caching, is_caching_enabled - - is_originally_enabled = is_caching_enabled() - - try: - if is_originally_enabled: - disable_caching() - yield - finally: - if is_originally_enabled: - enable_caching() diff --git a/mindnlp/trl/__init__.py b/mindnlp/trl/__init__.py deleted file mode 100644 index 8a4e8608e..000000000 --- a/mindnlp/trl/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""MindNLP Transformer Reinforcement Learning.""" - -from .trainer import ( - DPOTrainer, - DPOConfig, - FDivergenceType, - _build_tokenized_answer, - _truncate_tokens -) diff --git a/mindnlp/trl/core.py b/mindnlp/trl/core.py deleted file mode 100644 index ab5a028f9..000000000 --- a/mindnlp/trl/core.py +++ /dev/null @@ -1,332 +0,0 @@ -""" -# Copyright 2022 The HuggingFace Team. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
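Editor's note: the `check_package_availability` / `is_training_available` helpers above rely on installed-package metadata. A near-verbatim, runnable sketch using `importlib.metadata`, with one added guard because recent wheels may omit the `Home-page` field (in which case this heuristic reports `False` even though the package is installed):

```python
from importlib.metadata import PackageNotFoundError, metadata

def check_package_availability(package_name: str, owner: str) -> bool:
    try:
        meta = metadata(package_name)
        # Home-page can be absent in newer metadata, so guard against None
        return meta["Name"] == package_name and owner in (meta["Home-page"] or "")
    except PackageNotFoundError:
        return False

print(check_package_availability("datasets", "huggingface"))
# is_training_available() is just the conjunction of the two checks:
print(check_package_availability("accelerate", "huggingface")
      and check_package_availability("datasets", "huggingface"))
```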
-# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - - -from typing import Dict, List, Optional, Tuple, Union -from collections.abc import Mapping - -import numpy as np -import mindspore as ms - -from mindspore import ops -from mindspore import Tensor - -#与huggingface.transformers同路程 -from mindnlp.transformers.generation import TopKLogitsWarper, TopPLogitsWarper - -#暂时只更改import_utils中的这两个函数 -#from import_utils import is_npu_available, is_xpu_available - -#如果遇到需要padding的时候,补充的数为-1 -WANDB_PADDING = -1 - - -def top_k_top_p_filtering( - logits: Tensor, - top_k: int = 0, - top_p: float = 1.0, - filter_value: float = -float("Inf"), - min_tokens_to_keep: int = 1, -) -> Tensor: - """ - Filter a distribution of logits using top-k and/or nucleus (top-p) filtering. - - Args: - logits: logits distribution shape (batch size, vocabulary size) - top_k (`int`, *optional*, defaults to 0): - If > 0, only keep the top k tokens with highest probability (top-k filtering) - top_p (`float`, *optional*, defaults to 1.0): - If < 1.0, only keep the top tokens - with cumulative probability >= top_p (nucleus filtering). Nucleus - filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) - min_tokens_to_keep (`int`, *optional*, defaults to 1): - Minimumber of tokens we keep per batch example in the output. - - From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 - """ - - if top_k > 0: - logits = TopKLogitsWarper(top_k=top_k, - filter_value=filter_value, - min_tokens_to_keep=min_tokens_to_keep)(None, logits) - - if 0 <= top_p <= 1.0: - logits = TopPLogitsWarper(top_p=top_p, - filter_value=filter_value, - min_tokens_to_keep=min_tokens_to_keep)(None, logits) - - return logits - - -def flatten_dict(nested: Dict, sep: str = "/") -> Dict: - """Flatten dictionary and concatenate nested keys with separator.""" - - def recurse(nest: Dict, prefix: str, into: Dict) -> None: - for k, v in nest.items(): - if sep in k: - raise ValueError(f"separator '{sep}' not allowed to be in key '{k}'") - if isinstance(v, Mapping): - recurse(v, prefix + k + sep, into) - else: - into[prefix + k] = v - - flat = {} - recurse(nested, "", flat) - return flat - - -def convert_to_scalar(stats: Dict) -> Dict: - """ - Converts the stats from a flattened dict to single scalar dicts - """ - tensorboard_stats = {} - for k, v in stats.items(): - # for tensorboard compatibility - arrays and tensors are ignored with tensorboard - # therefore we convert single element tensors to scalars - if isinstance(v, (Tensor,np.ndarray)) and ( - len(v.shape) == 0 or (len(v.shape) == 1 and v.shape[0] == 1) - ): - v = v.item() - tensorboard_stats[k] = v - return tensorboard_stats - - -def pad_sequence(sequences, padding_value=0): - """ - Padding a set of sequences to make all sequences the same length - """ - # Find the maximum length of the sequences - max_len = max(seq.shape[0] for seq in sequences) - padded_seqs = [] - for seq in sequences: - # Calculate the padding needed - pad_len = max_len - seq.shape[0] - # Pad the sequence - # pad_seq = ops.Pad(((0, pad_len),),)(seq) - pad_seq = ops.pad(seq, (0, pad_len), mode='constant', value=padding_value) - 
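Editor's note: `top_k_top_p_filtering` above delegates to the transformers logits warpers. A framework-free NumPy sketch of the same idea, for illustration only (the exact tie-breaking of the library warpers may differ slightly):

```python
import numpy as np

def top_k_top_p_filter(logits: np.ndarray, top_k: int = 0, top_p: float = 1.0,
                       filter_value: float = -np.inf) -> np.ndarray:
    """logits: (batch, vocab). Filtered entries are set to filter_value."""
    logits = logits.astype(float)
    if top_k > 0:
        kth_best = np.sort(logits, axis=-1)[:, -top_k][:, None]
        logits[logits < kth_best] = filter_value        # keep only the k best tokens
    if top_p < 1.0:
        order = np.argsort(-logits, axis=-1)
        sorted_logits = np.take_along_axis(logits, order, axis=-1)
        probs = np.exp(sorted_logits - sorted_logits.max(-1, keepdims=True))
        probs /= probs.sum(-1, keepdims=True)
        cum = np.cumsum(probs, axis=-1)
        # drop a token once the tokens ranked above it already cover top_p of the mass
        remove = (cum - probs) >= top_p
        np.put_along_axis(logits, order,
                          np.where(remove, filter_value, sorted_logits), axis=-1)
    return logits

logits = np.log(np.array([[0.5, 0.3, 0.1, 0.1]]))
print(top_k_top_p_filter(logits, top_p=0.8))   # the two 0.1 tokens are filtered out
```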
padded_seqs.append(pad_seq) - return ops.stack(padded_seqs) - -def stack_dicts(stats_dicts): - """Stack the values of a dict.""" - results = {} - for k in stats_dicts[0]: - stats_list = [ops.reshape(d[k], (-1,)) for d in stats_dicts] - results[k] = pad_sequence(stats_list, padding_value=WANDB_PADDING) - return results - - -def add_suffix(input_dict: Dict, suffix: str) -> Dict: - """Add suffix to dict keys.""" - return {k + suffix: v for k, v in input_dict.items()} - - -def pad_to_size(tensor: Tensor, size: int, dim: int = 1, padding: int = 50256) -> Tensor: - """Pad tensor to size.""" - t_size = tensor.shape[dim] - if t_size == size: - return tensor - return ops.pad(tensor, (0, size - t_size), "constant", padding) - -def whiten(values: Tensor, shift_mean: bool = True) -> Tensor: - """Whiten values.""" - mean, var = ops.mean(values), ops.var(values) - whitened = (values - mean) * ops.rsqrt(var + 1e-8) - if not shift_mean: - whitened += mean - return whitened - -def masked_mean(values: Tensor, - mask: Tensor, - axis: Optional[bool] = None) -> Tensor: - """Compute mean of tensor with a masked values.""" - if axis is not None: - return (values * mask).sum(axis=axis) / mask.sum(axis=axis) - return (values * mask).sum() / mask.sum() - -def masked_var(values: Tensor, - mask: Tensor, - unbiased: bool = True, - axis: Optional[bool] = None) -> Tensor: - """Compute variance of tensor with masked values.""" - mean = masked_mean(values, mask, axis = axis) - centered_values = values - mean - variance = masked_mean(centered_values**2, mask) - if unbiased: - mask_sum = mask.sum() - if mask_sum == 0: - raise ValueError( - "The sum of the mask is zero, which can happen when `mini_batch_size=1`;" - "try increase the `mini_batch_size` or `gradient_accumulation_steps`" - ) - # note that if mask_sum == 1, then there is a division by zero issue - # to avoid it you just need to use a larger minibatch_size - bessel_correction = mask_sum / (mask_sum - 1) - variance = variance * bessel_correction - return variance - -def masked_whiten(values: Tensor, mask: Tensor, shift_mean: bool = True) -> Tensor: - """Whiten values with masked values.""" - mean, var = masked_mean(values, mask), masked_var(values, mask) - #rsqrt逐元素计算输入Tensor元素的平方根倒数。 - whitened = (values - mean) * ops.rsqrt(var + 1e-8) - if not shift_mean: - whitened += mean - return whitened - -def average_torch_dicts(list_of_dicts: List[Dict]) -> Dict: - """Average values of a list of dicts with torch tensors.""" - average_dict = {} - stack = ops.Stack() - rmean = ops.ReduceMean() - for key in list_of_dicts[0].keys(): - average_dict[key] = rmean(stack([d[key] for d in list_of_dicts]), axis=0) - return average_dict - -def entropy_from_logits(logits: Tensor) -> Tensor: - """Calculate entropy from logits.""" - pd = ops.Softmax(axis=-1)(logits) - entropy = ops.logsumexp(logits, -1) - ops.ReduceSum()(pd * logits, -1) - return entropy - -def clip_by_value(x: Tensor, tensor_min: float, tensor_max: float) -> Tensor: - """ - Tensor extension to torch.clamp - https://github.com/pytorch/pytorch/issues/2793#issuecomment-428784713 - """ - clipped = ops.maximum( - ops.minimum(x, Tensor(tensor_max, dtype=x.dtype)), - Tensor(tensor_min, dtype=x.dtype) - ) - return clipped - -def stats_to_np(stats_dict: Dict) -> Dict: - """Cast all torch.tensors in dict to numpy arrays.""" - new_dict = {} - for k, v in stats_dict.items(): - if isinstance(v, Tensor): - new_dict[k] = v.numpy() - elif isinstance(v, (int, float)): - new_dict[k] = float(v) - else: - new_dict[k] = v - return new_dict 
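Editor's note: the masked statistics above (`masked_mean`, `masked_var`, `masked_whiten`) are easy to sanity-check outside MindSpore. A minimal NumPy sketch with the same Bessel correction, illustrating that padded positions do not influence the whitening statistics:

```python
import numpy as np

def masked_mean(values, mask):
    return (values * mask).sum() / mask.sum()

def masked_var(values, mask, unbiased=True):
    mean = masked_mean(values, mask)
    var = masked_mean((values - mean) ** 2, mask)
    if unbiased:                         # Bessel correction over the unmasked count
        n = mask.sum()
        var = var * n / (n - 1)
    return var

def masked_whiten(values, mask, shift_mean=True):
    mean, var = masked_mean(values, mask), masked_var(values, mask)
    whitened = (values - mean) / np.sqrt(var + 1e-8)
    return whitened if shift_mean else whitened + mean

values = np.array([1.0, 2.0, 3.0, 100.0])   # the 100.0 is padding
mask = np.array([1.0, 1.0, 1.0, 0.0])
print(masked_whiten(values, mask))          # statistics ignore the padded entry
```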
- -def set_seed(seed: int) -> None: - """ - Helper function for reproducible behavior to - set the seed in `random`, `numpy`, and `mindspore`. - - Args: - seed (`int`): The seed to set. - """ - np.random.seed(seed) - ms.set_seed(seed) - -def randn_tensor( - shape: Union[Tuple, List], - generator: Optional[Union[List[np.random.Generator], - np.random.Generator]] = None, -) -> Tensor: - """A helper function to create random tensors - on the desired `device` with the desired `dtype`. When - passing a list of generators, - you can seed each batch size individually. - If CPU generators are passed, the tensor - is always created on the CPU. - """ - # device on which tensor is created defaults to device - batch_size = shape[0] - - # make sure generator list of length 1 is treated like a non-list - if isinstance(generator, list) and len(generator) == 1: - generator = generator[0] - - if isinstance(generator, list): - shape = (1,) + shape[1:] - latents = [ - ops.standard_normal(shape, seed=generator[i]) - for i in range(batch_size) - ] - latents = ops.concat(latents, axis=0) - else: - latents = ops.standard_normal(shape, seed=generator) - - return latents - -class LengthSampler: - """ - Samples a length - """ - - def __init__(self, min_value: int, max_value: int): - self.values = list(range(min_value, max_value)) - - def __call__(self) -> int: - return np.random.choice(self.values) - - - - -####################################################### - -# def respond_to_batch( -# model: nn.Module, queries: List[torch.LongTensor], -# txt_len: int = 20, top_k: int = 0, top_p: float = 1.0 -# ) -> torch.LongTensor: -# """Sample text from language model.""" -# input_ids = queries -# for _i in range(txt_len): -# # Get Logits -# outputs = model(input_ids) -# next_token_logits = outputs[0][:, -1, :] -# next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) -# # Sample -# probs = F.softmax(next_token_logits, dim=-1) -# next_token = torch.multinomial(probs, num_samples=1).squeeze(1) -# input_ids = torch.cat([input_ids, next_token.unsqueeze(-1)], dim=-1) -# return input_ids[:, -txt_len:] - - -# class PPODecorators: -# optimize_device_cache = False - -# @classmethod -# @contextmanager -# def empty_device_cache(cls): -# yield -# if cls.optimize_device_cache: -# if is_xpu_available(): -# gc.collect() -# torch.xpu.empty_cache() -# gc.collect() -# elif is_npu_available(): -# gc.collect() -# torch.npu.empty_cache() -# gc.collect() -# elif torch.cuda.is_available(): -# gc.collect() -# torch.cuda.empty_cache() -# gc.collect( - - - - - - - -# Usage example: -# generator = np.random.default_rng(seed=42) -# randn_tensor((3, 4), generator=generator, device='GPU', dtype=mindspore.float32) diff --git a/mindnlp/trl/environment/__init__.py b/mindnlp/trl/environment/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mindnlp/trl/extras/__init__.py b/mindnlp/trl/extras/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mindnlp/trl/import_utils.py b/mindnlp/trl/import_utils.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mindnlp/trl/models/__init__.py b/mindnlp/trl/models/__init__.py deleted file mode 100644 index a58fd95c0..000000000 --- a/mindnlp/trl/models/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""trl model __init__""" -from .modeling_base import ( - PreTrainedModelWrapper, - create_reference_model -) diff --git a/mindnlp/trl/models/modeling_base.py b/mindnlp/trl/models/modeling_base.py deleted file mode 100644 index 
8b6cc0ff1..000000000 --- a/mindnlp/trl/models/modeling_base.py +++ /dev/null @@ -1,639 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""trl model main class.""" -import logging -import os -from copy import deepcopy -from typing import Optional - -import mindspore -from mindspore import nn - -from mindnlp.peft import ( - PeftConfig, - PeftModel, - PeftModelForCausalLM, - PeftModelForSeq2SeqLM, - PromptLearningConfig, - get_peft_model, -) - -from ...transformers import PreTrainedModel - - -LAYER_PATTERNS = [ - "transformer.h.{layer}", - "model.decoder.layers.{layer}", - "gpt_neox.layers.{layer}", - "model.layers.{layer}", -] - - -class PreTrainedModelWrapper(nn.Cell): - r""" - A wrapper class around a (`transformers.PreTrainedModel`) to be compatible with the - (`~transformers.PreTrained`) class in order to keep some attributes and methods of the - (`~transformers.PreTrainedModel`) class. - - Attributes: - pretrained_model (`transformers.PreTrainedModel`): - The model to be wrapped. - parent_class (`transformers.PreTrainedModel`): - The parent class of the model to be wrapped. - supported_args (`list`): - The list of arguments that are supported by the wrapper class. - """ - - transformers_parent_class = None - supported_args = None - supported_modules = ("v_head",) - supported_rm_modules = ("score",) - supported_pretrained_model_architectures = ( - (PreTrainedModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM) - ) - - def __init__( - self, pretrained_model=None, score_module=None, supports_rm_adapter=False, rm_adapter_name=None, **kwargs - ): - super().__init__() - self.pretrained_model = pretrained_model - - self.config = pretrained_model.config - self.prepare_inputs_for_generation = pretrained_model.prepare_inputs_for_generation - self.is_loaded_in_8bit = getattr(pretrained_model, "is_loaded_in_8bit", False) - self.is_loaded_in_4bit = getattr(pretrained_model, "is_loaded_in_4bit", False) - self.is_sequential_parallel = False - - if hasattr(pretrained_model, "gradient_checkpointing_disable"): - self.gradient_checkpointing_disable = pretrained_model.gradient_checkpointing_disable - - if hasattr(pretrained_model, "gradient_checkpointing_enable"): - self.gradient_checkpointing_enable = pretrained_model.gradient_checkpointing_enable - - if hasattr(pretrained_model, "enable_input_require_grads"): - self.enable_input_require_grads = pretrained_model.enable_input_require_grads - - self.supports_rm_adapter = supports_rm_adapter - self.rm_adapter_name = rm_adapter_name - self.policy_adapter_name = "default" - if score_module is not None: - self.score = score_module - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - r""" - Instantiates a new model from a pretrained model from `transformers`. The - pretrained model is loaded using the `from_pretrained` method of the - `transformers.PreTrainedModel` class. 
The arguments that are specific to the - `transformers.PreTrainedModel` class are passed along this method and filtered - out from the `kwargs` argument. - - Args: - pretrained_model_name_or_path (`str` or `transformers.PreTrainedModel`): - The path to the pretrained model or its name. - *model_args (`list`, *optional*)): - Additional positional arguments passed along to the underlying model's - `from_pretrained` method. - **kwargs (`dict`, *optional*): - Additional keyword arguments passed along to the underlying model's - `from_pretrained` method. We also pre-process the kwargs to extract - the arguments that are specific to the `transformers.PreTrainedModel` - class and the arguments that are specific to trl models. The kwargs - also support `prepare_model_for_kbit_training` arguments from - `peft` library. - """ - if kwargs is not None: - peft_config = kwargs.pop("peft_config", None) - reward_adapter = kwargs.pop("reward_adapter", None) - reward_adapter_name = kwargs.pop("reward_adapter_name", "reward_adapter") - is_trainable = kwargs.pop("is_trainable", False) - trl_model_args, pretrained_kwargs, peft_quantization_kwargs = cls._split_kwargs(kwargs) - token = pretrained_kwargs.get("token", None) - else: - peft_config = None - is_trainable = False - trl_model_args = {} - pretrained_kwargs = {} - peft_quantization_kwargs = {} - token = None - - if reward_adapter is not None and not isinstance(reward_adapter, str): - raise ValueError( - "The `reward_adapter` argument should be a string representing the name of local path or the Hub id to the Reward Modeling adapter." - ) - - is_peft_model = False - - current_device = cls._get_current_device() - if isinstance(pretrained_model_name_or_path, str): - is_loaded_in_8bit = pretrained_kwargs["load_in_8bit"] if "load_in_8bit" in pretrained_kwargs else False - is_loaded_in_4bit = pretrained_kwargs["load_in_4bit"] if "load_in_4bit" in pretrained_kwargs else False - else: - is_loaded_in_8bit = getattr(pretrained_model_name_or_path, "is_loaded_in_8bit", False) - is_loaded_in_4bit = getattr(pretrained_model_name_or_path, "is_loaded_in_4bit", False) - - if (is_loaded_in_8bit or is_loaded_in_4bit) and "device_map" not in pretrained_kwargs: - # warn users - logging.warning( - "The `device_map` argument is not provided. We will override the device_map argument." - " to set the entire" - " model on the current device. If you want to set the model on multiple devices, please provide" - " a custom `device_map` argument." 
- ) - pretrained_kwargs["device_map"] = {"": current_device} - - if peft_config is not None and not isinstance(peft_config, PeftConfig): - raise ValueError("The `peft_config` argument should be an instance of `peft.PeftConfig` class.") - - # First, load the pre-trained model using the parent-class - # either `AutoModelForCausalLM` or `AutoModelForSeq2SeqLM` - if isinstance(pretrained_model_name_or_path, str): - remote_adapter_config = None - - local_adapter_present = os.path.exists(os.path.join(pretrained_model_name_or_path, "adapter_config.json")) - - if local_adapter_present or remote_adapter_config is not None: - if peft_config is not None: - logging.warning( - "`peft_config` argument ignored since a peft config file was found in " - f"{pretrained_model_name_or_path}" - ) - - # Load the trained peft adapter config - if local_adapter_present: - trained_adapter_config = PeftConfig.from_pretrained(pretrained_model_name_or_path) - else: - remote_adapter_dir = os.path.dirname(remote_adapter_config) - trained_adapter_config = PeftConfig.from_pretrained(remote_adapter_dir) - - # Load the pretrained base model - pretrained_model = cls.transformers_parent_class.from_pretrained( - trained_adapter_config.base_model_name_or_path, *model_args, **pretrained_kwargs - ) - - # Wrap the pretrained model with the trained peft adapter - pretrained_model = PeftModel.from_pretrained( - pretrained_model, pretrained_model_name_or_path, is_trainable=is_trainable, token=token - ) - logging.info("Trained peft adapter loaded") - else: - pretrained_model = cls.transformers_parent_class.from_pretrained( - pretrained_model_name_or_path, *model_args, **pretrained_kwargs - ) - - if peft_config is not None: - # Initialize a new peft adapter with the given config - # if is_loaded_in_8bit or is_loaded_in_4bit: - # pretrained_model = prepare_model_for_kbit_training( - # pretrained_model, - # **peft_quantization_kwargs, - # ) - pretrained_model = get_peft_model(pretrained_model, peft_config) - logging.info("peft adapter initialised") - - elif isinstance(pretrained_model_name_or_path, cls.supported_pretrained_model_architectures): - pretrained_model = pretrained_model_name_or_path - - if peft_config is not None and isinstance(pretrained_model, PreTrainedModel): - # Initialize a new peft adapter with the given config - # if is_loaded_in_8bit or is_loaded_in_4bit: - # pretrained_model = prepare_model_for_kbit_training( - # pretrained_model, - # **peft_quantization_kwargs, - # ) - pretrained_model = get_peft_model(pretrained_model, peft_config) - logging.info("peft adapter initialised") - else: - raise ValueError( - "pretrained_model_name_or_path should be a string or a PreTrainedModel, " - f"but is {type(pretrained_model_name_or_path)}" - ) - - if isinstance(pretrained_model, PeftModel): - is_peft_model = True - # for backward compatibility - if hasattr(pretrained_model, "active_peft_config") and isinstance( - pretrained_model.active_peft_config, PromptLearningConfig - ): - raise ValueError("PromptLearningConfig is not supported for PPO training.") - - # Add reward modeling adapter if specified - if not is_peft_model and reward_adapter is not None: - raise ValueError("reward_adapter can only be used with a PeftModel. 
") - elif is_peft_model and reward_adapter is not None: - score_module = cls.add_and_load_reward_modeling_adapter( - pretrained_model, reward_adapter, reward_adapter_name, token=token - ) - multi_adapter_args = { - "score_module": score_module, - "supports_rm_adapter": True, - "rm_adapter_name": reward_adapter_name, - } - else: - multi_adapter_args = {"supports_rm_adapter": False} - - # Then, create the full model by instantiating the wrapper class - model = cls(pretrained_model, **multi_adapter_args, **trl_model_args) - - # if resume_training, load the state_dict again - this is ok since the - # state_dict is removed from the model after loading it. - is_resuming_training = True - if isinstance(pretrained_model_name_or_path, str): - safe_filename = os.path.join(pretrained_model_name_or_path, "model.safetensors") - filename = os.path.join(pretrained_model_name_or_path, "pytorch_model.bin") - - sharded_index_filename = os.path.join(pretrained_model_name_or_path, "pytorch_model.bin.index.json") - safe_sharded_index_filename = os.path.join(pretrained_model_name_or_path, "model.safetensors.index.json") - is_sharded = False - use_safe = os.path.exists(safe_filename) - - # if not (os.path.exists(filename) or os.path.exists(safe_filename)): - # # Try with `pytorch_model.bin` - # filename, files_to_download, is_sharded, is_resuming_training = cls._get_checkpoint_from_hub( - # pretrained_model, - # pretrained_model_name_or_path, - # sharded_index_filename, - # token=token, - # ) - # # Try with safetensors - # if filename is None and files_to_download is None: - # safe_filename, files_to_download, is_sharded, is_resuming_training = cls._get_checkpoint_from_hub( - # pretrained_model, - # pretrained_model_name_or_path, - # safe_sharded_index_filename, - # token=token, - # model_name="model.safetensors", - # model_index_name="model.safetensors.index.json", - # ) - # use_safe = True - # else: - # use_safe = False - - loading_func = mindspore.load_checkpoint - load_kwargs = {} - if is_resuming_training: - # if is_sharded: - # # download each file and add it to the state_dict - # state_dict = {} - - # for shard_file in files_to_download: - # filename = hf_hub_download( - # pretrained_model_name_or_path, - # shard_file, - # token=token, - # ) - # state_dict.update(loading_func(filename, **load_kwargs)) - # else: - state_dict = loading_func(filename if not use_safe else safe_filename, **load_kwargs) - - else: - state_dict = pretrained_model_name_or_path.state_dict() - - model.is_peft_model = is_peft_model - model.current_device = current_device - - if is_resuming_training: - model.post_init(state_dict=state_dict) - - return model - - # @classmethod - # def _get_checkpoint_from_hub( - # cls, - # pretrained_model, - # pretrained_model_name_or_path, - # index_filename, - # token=None, - # model_name="pytorch_model.bin", - # model_index_name="pytorch_model.bin.index.json", - # ): - # files_to_download = None - # filename = None - # is_resuming_training = True - # is_sharded = False - - # try: - # filename = hf_hub_download( - # pretrained_model_name_or_path, - # model_name, - # token=token, - # ) - # # sharded - # except (EntryNotFoundError, LocalEntryNotFoundError, HFValidationError, RepositoryNotFoundError): - # if os.path.exists(index_filename): - # index_file_name = index_filename - # else: - # try: - # index_file_name = hf_hub_download( - # pretrained_model_name_or_path, - # model_index_name, - # token=token, - # ) - # except (EntryNotFoundError, LocalEntryNotFoundError, HFValidationError, 
RepositoryNotFoundError): - # # not continue training, do not have v_head weight - # is_resuming_training = False - # logging.warning( - # f"A {type(pretrained_model)} model is loaded from '{pretrained_model_name_or_path}', " - # f"and no v_head weight is found. This IS expected if you are not resuming PPO training." - # ) - # # load json - # if is_resuming_training: - # with open(index_file_name) as f: - # index = json.load(f) - # # check filename with `v_head` or any known extra module: - # files_to_download = set() - # for k, v in index["weight_map"].items(): - # if any(module in k for module in cls.supported_modules): - # files_to_download.add(v) - # is_sharded = True - - # return filename, files_to_download, is_sharded, is_resuming_training - - @classmethod - def _get_current_device(cls): - r""" - Get the current device. For GPU, we return the local process index using the `accelerate.PartialState` - object to handle corner cases when running scripts in distributed environments. - - Returns: - current_device (`Union[int, str]`): - The current device. - """ - # state = PartialState() - # if is_xpu_available(): - # return f"xpu:{state.local_process_index}" - # elif is_npu_available(): - # return f"npu:{state.local_process_index}" - # else: - # return state.local_process_index if torch.cuda.is_available() else "cpu" - # return state.local_process_index if torch.cuda.is_available() else "cpu" - return 0 - - @classmethod - def _split_kwargs(cls, kwargs): - """ - Separate the kwargs from the arguments that we support inside - `supported_args` and the ones that we don't. - """ - check_peft_kwargs = False - - supported_kwargs = {} - unsupported_kwargs = {} - peft_kwargs = {} - - for key, value in kwargs.items(): - if key in cls.supported_args: - supported_kwargs[key] = value - else: - unsupported_kwargs[key] = value - - return supported_kwargs, unsupported_kwargs, peft_kwargs - - @classmethod - def add_and_load_reward_modeling_adapter( - cls, pretrained_model, adapter_model_id, adapter_name="reward_model_adapter", token=None - ): - r""" - Add and load a reward modeling adapter. This method can only be used if the - model is a `PeftModel` and if you have initialized the model with the `reward_modeling_adapter_id` - argument, pointing to the id of the reward modeling adapter. The latest needs also to contain the - score head in order to produce the reward. - """ - pretrained_model.load_adapter(adapter_model_id, adapter_name, is_trainable=False) - pretrained_model.train() - - filename = os.path.join(adapter_model_id, "adapter_model.bin") - safe_loading = False - # if not os.path.exists(filename): - # try: - # local_filename = hf_hub_download( - # adapter_model_id, - # "adapter_model.bin", - # token=token, - # ) - # except Exception: - # filename = os.path.join(adapter_model_id, "adapter_model.safetensors") - # safe_loading = True - # if not os.path.exists(filename): - # try: - # local_filename = hf_hub_download( - # adapter_model_id, - # "adapter_model.safetensors", - # token=token, - # ) - # except Exception as exc: - # raise ValueError( - # "Could not find adapter model in the Hub, " - # "make sure you have the correct adapter model id." - # ) from exc - # else: - # local_filename = filename - # else: - # local_filename = filename - # make for only local file. 
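Editor's note: `_split_kwargs` above routes keyword arguments either to the trl wrapper or to the underlying `from_pretrained` call by checking a per-class whitelist. A tiny illustrative sketch; the `supported_args` tuple here is made up (real subclasses define their own):

```python
def split_kwargs(kwargs, supported_args):
    """Route kwargs either to the wrapper or onward to from_pretrained."""
    supported, unsupported = {}, {}
    for key, value in kwargs.items():
        (supported if key in supported_args else unsupported)[key] = value
    return supported, unsupported

trl_kwargs, hf_kwargs = split_kwargs(
    {"reward_adapter_name": "reward_adapter", "torch_dtype": "float16"},
    supported_args=("reward_adapter_name",),   # illustrative whitelist
)
print(trl_kwargs)   # consumed by the wrapper
print(hf_kwargs)    # forwarded to the base model's from_pretrained
```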
- local_filename = filename - - loading_func = mindspore.load_checkpoint - load_kwargs = {} - - adapter_state_dict = loading_func(local_filename, **load_kwargs) - - for score_name_candidate in cls.supported_rm_modules: - if any(score_name_candidate in name for name in adapter_state_dict.keys()): - score_name = score_name_candidate - # we have found the correct head name and can break - break - - score_dict = {} - - for name, param in adapter_state_dict.items(): - if score_name in name: - key_name = ".".join(name.split(".")[-1:]) - score_dict[key_name] = param.to(cls._get_current_device()) - - num_labels, hidden_dim = score_dict["weight"].shape - has_bias = any("bias" in name for name in adapter_state_dict.keys()) - - score = nn.Linear(hidden_dim, num_labels, bias=has_bias).to( - device=cls._get_current_device(), - dtype=pretrained_model.dtype, - ) - score.load_state_dict(score_dict) - for param in score.parameters(): - param.requires_grad = False - - return score - - def push_to_hub(self, *args, **kwargs): - r""" - Push the pretrained model to the hub. This method is a wrapper around - `transformers.PreTrainedModel.push_to_hub`. Please refer to the documentation - of `transformers.PreTrainedModel.push_to_hub` for more information. - - Args: - *args (`list`, *optional*): - Positional arguments passed along to the underlying model's - `push_to_hub` method. - **kwargs (`dict`, *optional*): - Keyword arguments passed along to the underlying model's - `push_to_hub` method. - """ - raise NotImplementedError - - def save_pretrained(self, *args, **kwargs): - r""" - Save the pretrained model to a directory. This method is a wrapper around - `transformers.PreTrainedModel.save_pretrained`. Please refer to the documentation - of `transformers.PreTrainedModel.save_pretrained` for more information. - - Args: - *args (`list`, *optional*): - Positional arguments passed along to the underlying model's - `save_pretrained` method. - **kwargs (`dict`, *optional*): - Keyword arguments passed along to the underlying model's - `save_pretrained` method. - """ - state_dict = kwargs.get("state_dict") - if state_dict is None: - state_dict = self.state_dict() - kwargs["state_dict"] = state_dict - - # if it is a peft model only save the `v_head` state_dict and - # pop the `state_dict` from the kwargs to avoid slient bugs with `peft` - if self.is_peft_model: - save_path = args[0] - save_path = os.path.join(save_path, "pytorch_model.bin") - mindspore.save_checkpoint(state_dict, save_path) - _ = kwargs.pop("state_dict", None) - - return self.pretrained_model.save_pretrained(*args, **kwargs) - - def state_dict(self, *args, **kwargs): - r""" - Return the state_dict of the pretrained model. - """ - raise NotImplementedError - - def post_init(self, *args, **kwargs): - r""" - Post initialization method. This method is called after the model is - instantiated and loaded from a checkpoint. It can be used to perform - additional operations such as loading the state_dict. - """ - raise NotImplementedError - - def compute_reward_score(self, input_ids, attention_mask=None, **kwargs): - r""" - Computes the reward score for a given input. The method has first to enable the adapter - and then compute the reward score. After that the model disables the reward modeling - adapter and enables the default ppo adapter again. 
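Editor's note: the reward-adapter loading above boils down to filtering the adapter state dict for the score-head entries and renaming them to plain `weight` / `bias` before loading a frozen linear head. A pure-Python sketch with a hypothetical state dict:

```python
# Hypothetical flat adapter state dict; only the "score.*" entries feed the reward head.
adapter_state_dict = {
    "base_model.model.score.weight": [[0.1] * 8],   # (num_labels, hidden_dim)
    "base_model.model.score.bias": [0.0],
    "base_model.model.lora_A.default.weight": [[0.2] * 8],
}

score_dict = {
    name.split(".")[-1]: param                      # keep only the trailing "weight" / "bias"
    for name, param in adapter_state_dict.items()
    if "score" in name
}
print(sorted(score_dict))   # ['bias', 'weight'] -> loaded into a frozen linear reward head
```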
- """ - if not self.supports_rm_adapter: - raise ValueError("This model does not support reward modeling adapter.") - - # enable rm adapter - self.pretrained_model.set_adapter(self.rm_adapter_name) - self.pretrained_model.eval() - - with mindspore._no_grad(): - base_model_output = self.pretrained_model( - input_ids=input_ids, - attention_mask=attention_mask, - output_hidden_states=True, - return_dict=True, - **kwargs, - ) - - last_hidden_states = base_model_output.hidden_states[-1] - scores = self.score(last_hidden_states) - - self.pretrained_model.set_adapter(self.policy_adapter_name) - self.pretrained_model.eval() - - return scores - - -def create_reference_model( - model: PreTrainedModelWrapper, num_shared_layers: Optional[int] = None, pattern: Optional[str] = None -) -> PreTrainedModelWrapper: - """ - Creates a static reference copy of a model. Note that model will be in `.eval()` mode. - - Args: - model (`PreTrainedModelWrapper`): The model to be copied. - num_shared_layers (`int`, *optional*): The number of initial layers that are shared between both models and kept frozen. - pattern (`str`, *optional*): The shared layers are selected with a string pattern - (e.g. "transformer.h.{layer}" for GPT2) and if a custom pattern is necessary it can be passed here. - - Returns: - `PreTrainedModelWrapper` - """ - # if is_deepspeed_zero3_enabled(): - # raise ValueError( - # "DeepSpeed ZeRO-3 is enabled and is not compatible with `create_reference_model()`. Please instantiate your reference model directly with `AutoCausalLM.from_pretrained()`." - # ) - - parameter_names = [n for n, _ in model.named_parameters()] - ref_model = deepcopy(model) - - # if no layers are shared, return copy of model - if num_shared_layers is None: - for param_name in parameter_names: - param = ref_model.get_parameter(param_name) - param.requires_grad = False - return ref_model.eval() - - # identify layer name pattern - if pattern is not None: - pattern = pattern.format(layer=num_shared_layers) - else: - for pattern_candidate in LAYER_PATTERNS: - pattern_candidate = pattern_candidate.format(layer=num_shared_layers) - if any(pattern_candidate in name for name in parameter_names): - pattern = pattern_candidate - break - - if pattern is None: - raise ValueError("Layer pattern could not be matched.") - - # divide parameters in shared and unshared parameter lists - shared_param_list = [] - unshared_param_list = [] - - shared_parameter = True - for name, _param in model.named_parameters(): - if pattern in name: - shared_parameter = False - if shared_parameter: - shared_param_list.append(name) - else: - unshared_param_list.append(name) - - # create reference of the original parameter if they are shared - for param_name in shared_param_list: - param = model.get_parameter(param_name) - param.requires_grad = False - - _ref_param = ref_model.get_parameter(param_name) - - # for all other parameters just make sure they don't use gradients - for param_name in unshared_param_list: - param = ref_model.get_parameter(param_name) - param.requires_grad = False - - if pattern is not None and len(unshared_param_list) == 0: - logging.warning("Pattern passed or found, but no layers matched in the model. 
Check for a typo.") - - return ref_model.eval() diff --git a/mindnlp/trl/readme.md b/mindnlp/trl/readme.md deleted file mode 100644 index e69de29bb..000000000 diff --git a/mindnlp/trl/trainer/__init__.py b/mindnlp/trl/trainer/__init__.py deleted file mode 100644 index c4346ed43..000000000 --- a/mindnlp/trl/trainer/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -"""trainer init file.""" -# from .base import BaseTrainer -from .dpo_trainer import DPOTrainer, _build_tokenized_answer, _truncate_tokens -from .dpo_config import DPOConfig, FDivergenceType diff --git a/mindnlp/trl/trainer/base.py b/mindnlp/trl/trainer/base.py deleted file mode 100644 index 6cfa4aa44..000000000 --- a/mindnlp/trl/trainer/base.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# from huggingface_hub import PyTorchModelHubMixin - - -# class BaseTrainer(PyTorchModelHubMixin): -# r""" -# Base class for all trainers - this base class implements the basic functions that we -# need for a trainer. - -# The trainer needs to have the following functions: -# - step: takes in a batch of data and performs a step of training -# - loss: takes in a batch of data and returns the loss -# - compute_rewards: takes in a batch of data and returns the rewards -# - _build_models_and_tokenizer: builds the models and tokenizer -# - _build_dataset: builds the dataset -# Each user is expected to implement their own trainer class that inherits from this base -# if they want to use a new training algorithm. 
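Editor's note: the `create_reference_model` helper that ends above splits parameter names at the first occurrence of a formatted layer pattern; everything before the cut is treated as shared (and frozen in the policy model too), everything after it is frozen only in the reference copy. A simplified sketch of just the name-splitting step, with a toy parameter list:

```python
LAYER_PATTERNS = ["transformer.h.{layer}", "model.layers.{layer}"]

def split_shared_unshared(parameter_names, num_shared_layers):
    """Simplified mirror of the pattern matching in create_reference_model."""
    pattern = next(
        (p.format(layer=num_shared_layers)
         for p in LAYER_PATTERNS
         if any(p.format(layer=num_shared_layers) in name for name in parameter_names)),
        None,
    )
    if pattern is None:
        raise ValueError("Layer pattern could not be matched.")
    shared, unshared, in_shared = [], [], True
    for name in parameter_names:
        if pattern in name:
            in_shared = False   # from the matched layer onward, parameters are not shared
        (shared if in_shared else unshared).append(name)
    return shared, unshared

names = [f"transformer.h.{i}.attn.weight" for i in range(4)] + ["lm_head.weight"]
print(split_shared_unshared(names, num_shared_layers=2))
# shared:   transformer.h.0.*, transformer.h.1.*
# unshared: transformer.h.2.*, transformer.h.3.*, lm_head.weight
```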
-# """ - -# def __init__(self, config): -# self.config = config - -# def step(self, *args): -# raise NotImplementedError("Not implemented") - -# def loss(self, *args): -# raise NotImplementedError("Not implemented") - -# def compute_rewards(self, *args): -# raise NotImplementedError("Not implemented") - -# def _save_pretrained(self, save_directory): -# raise NotImplementedError("Not implemented") diff --git a/mindnlp/trl/trainer/callbacks.py b/mindnlp/trl/trainer/callbacks.py deleted file mode 100644 index 13920c10c..000000000 --- a/mindnlp/trl/trainer/callbacks.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Trainer CallBacks Module.""" -from typing import Optional, Union - -import mindspore - -from mindnlp.engine.callbacks import TrainerCallback -from ...transformers.modeling_utils import PreTrainedModel - - - - -class SyncRefModelCallback(TrainerCallback): - """Sync Reference Model in training.""" - def __init__( - self, - ref_model: Union[PreTrainedModel, mindspore.nn.Cell], - accelerator: Optional[None], - ): - self.accelerator = accelerator - self.ref_model = ref_model - - @staticmethod - def _sync_target_model(model, target_model, alpha): - for target_param, copy_param in zip(target_model.parameters(), model.parameters()): - target_param.data.mul_(1.0 - alpha).add_(copy_param.data, alpha=alpha) - - @staticmethod - def sync_target_model(model, target_model, alpha): - """sync the target model with training model.""" - # deepspeed_plugin = AcceleratorState().deepspeed_plugin - # if deepspeed_plugin is not None and deepspeed_plugin.zero_stage == 3: - # with deepspeed.zero.GatheredParameters( - # list(model.parameters()) + list(target_model.parameters()), modifier_rank=0 - # ): - # if deepspeed.comm.get_rank() == 0: - # SyncRefModelCallback._sync_target_model(model, target_model, alpha) - # else: - SyncRefModelCallback._sync_target_model(model, target_model, alpha) - - def on_step_end(self, args, state, control, **kwargs): - model: PreTrainedModel = kwargs["model"] - - if self.ref_model is not None and state.global_step % args.ref_model_sync_steps == 0: - # if self.accelerator: - # model = self.accelerator.unwrap_model(model) - self.sync_target_model(model, self.ref_model, args.ref_model_mixup_alpha) diff --git a/mindnlp/trl/trainer/dpo_config.py b/mindnlp/trl/trainer/dpo_config.py deleted file mode 100644 index 24b35fb03..000000000 --- a/mindnlp/trl/trainer/dpo_config.py +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
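Editor's note: the `SyncRefModelCallback._sync_target_model` update above is a soft (EMA-style) copy of the policy into the reference model, applied every `ref_model_sync_steps` steps. A NumPy sketch of the same in-place update, illustrative only:

```python
import numpy as np

def sync_target_model(policy_params, target_params, alpha=0.9):
    """Soft update, as in _sync_target_model: target <- (1 - alpha) * target + alpha * policy."""
    for target, policy in zip(target_params, policy_params):
        target *= 1.0 - alpha
        target += alpha * policy

policy = [np.ones(3)]
target = [np.zeros(3)]
sync_target_model(policy, target, alpha=0.9)
print(target[0])   # [0.9 0.9 0.9]
```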
-"""DPO Config.""" -import warnings -from dataclasses import dataclass -from enum import Enum -from typing import Any, Dict, Literal, Optional - -from mindnlp.engine import TrainingArguments - - -class FDivergenceType(Enum): - """FDivergenceType""" - REVERSE_KL = "reverse_kl" - JS_DIVERGENCE = "js_divergence" - ALPHA_DIVERGENCE = "alpha_divergence" - - -class FDivergenceConstants: - """FDivergenceConstants.""" - ALPHA_DIVERGENCE_COEF_KEY = "alpha_divergence_coef" - ALPHA_DIVERGENCE_COEF_DEFAULT = 1.0 - - -@dataclass -class DPOConfig(TrainingArguments): - r""" - Configuration class for the [`DPOTrainer`]. - - Using [`~transformers.HfArgumentParser`] we can turn this class into - [argparse] arguments that can be specified on the command line. - - Parameters: - beta (`float`, *optional*, defaults to `0.1`): - Parameter controlling the deviation from the reference model. Higher β means less - deviation from the reference model. For the IPO loss (`loss_type="ipo"`), β is the - regularization parameter denoted by τ in the - [paper](https://huggingface.co/papers/2310.12036). - label_smoothing (`float`, *optional*, defaults to `0.0`): - Robust DPO label smoothing parameter from the [cDPO](https://ericmitchell.ai/ - cdpo.pdf) report and [Robust DPO](https://huggingface.co/papers/2403.00409) - paper that should be between `0.0` and `0.5`. - loss_type (`str`, *optional*, defaults to `"sigmoid"`): - Type of loss to use. Possible values are: - - - `"sigmoid"`: sigmoid loss from the original - [DPO](https://huggingface.co/papers/2305.18290) paper. - - `"hinge"`: hinge loss on the normalized likelihood from the - [SLiC](https://huggingface.co/papers/2305.10425) paper. - - `"ipo"`: IPO loss from the - [IPO](https://huggingface.co/papers/2310.12036) paper. - - `"exo_pair"`: pairwise EXO loss from the - [EXO](https://huggingface.co/papers/2402.00856) paper. - - `"nca_pair"`: pairwise NCA loss from the - [NCA](https://huggingface.co/papers/2402.05369) paper. - - `"robust"`: unbiased estimate of the DPO loss that is robust - to preference noise from the - [Robust DPO](https://huggingface.co/papers/2403.00409) paper. - - `"bco_pair"`: pairwise BCO loss from the - [BCO](https://huggingface.co/papers/2404.04656) paper. - - `"sppo_hard"`: SPPO loss with hard label from the - [SPPO](https://huggingface.co/papers/2405.00675) paper. - - `"aot"`: AOT loss for paired datasets from the - [AOT](https://huggingface.co/papers/2406.05882) paper. - - `"aot_pair"`: AOT loss for unpaired datasets from the - [AOT](https://huggingface.co/papers/2406.05882) paper. - - `"apo_zero"`: APO-zero loss from the - [APO](https://huggingface.co/papers/2408.06266) paper. - - `"apo_down"`: APO-down loss from the - [APO](https://huggingface.co/papers/2408.06266) paper. - - label_pad_token_id (`int`, *optional*, defaults to `-100`): - Label pad token id. This argument is required if you want to use the - default data collator. - padding_value (`Optional[int]`, *optional*, defaults to `None`): - Padding value to use. If `None`, the padding value of the tokenizer - is used. - truncation_mode (`str`, *optional*, defaults to `"keep_end"`): - Truncation mode to use, either `keep_end` or `keep_start`. This argument - is required if you want to use the default data collator. - max_length (`Optional[int]`, *optional*, defaults to `None`): - Maximum length of the sequences (prompt + completion) in the batch. - This argument - is required if you want to use the default data collator. 
- max_prompt_length (`Optional[int]`, *optional*, defaults to `None`): - Maximum length of the prompt. This argument is required if you - want to use the default data collator. - max_completion_length (`Optional[int]`, *optional*, defaults to `None`): - Maximum length of the target. This argument is required if you want - to use the default data collator and your model is an encoder-decoder. - is_encoder_decoder(`Optional[int]`, *optional*, defaults to `None`): - When using the `model_init` argument (callable) to instantiate the model - instead of the `model` argument, you need to specify if the model - returned by the callable is an encoder-decoder model. - disable_dropout (`bool`, *optional*, defaults to `True`): - Whether to disable dropout in the model and reference model. - generate_during_eval (`bool`, *optional*, defaults to `False`): - Truncation mode to use when the prompt is too long. Possible values are - `"keep_end"` or `"keep_start"`.This argument is required if you want to use - the default data collator. - precompute_ref_log_probs (`bool`, *optional*, defaults to `False`): - Whether to precompute reference model log probabilities for training and - evaluation datasets. This is useful when training without the reference - model to reduce the total GPU memory needed. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): - Number of processes to use for processing the dataset. - model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`): - Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` - when instantiating the model from a string. - ref_model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`): - Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when - instantiating the reference model from a string. - model_adapter_name (`Optional[str]`, *optional*, defaults to `None`): - Name of the train target PEFT adapter, when using LoRA with multiple adapters. - ref_adapter_name (`Optional[str]`, *optional*, defaults to `None`): - Name of the reference PEFT adapter, when using LoRA with multiple adapters. - reference_free (`bool`, *optional*, defaults to `False`): - If `True`, we ignore the _provided_ reference model and implicitly use a - reference model that assigns equal probability to all responses. - force_use_ref_model (`bool`, *optional*, defaults to `False`): - In case one passes a PEFT model for the active model and you want to - use a different model for the ref_model, set this flag to `True`. - f_divergence_type (`str`, *optional*, defaults to `FDivergenceType.REVERSE_KL`): - Type of f-divergence regularization function to compute divergence between - policy and reference model. - f_alpha_divergence_coef (`float`, *optional*, defaults to `1.0`): - α coefficient in the α-divergence \\(u^{-\\alpha}\\) regularization - function for DPO loss. - sync_ref_model (`bool`, *optional*, defaults to `False`): - When set to `True`, the reference model is synchronized with the active - model every `ref_model_sync_steps` steps, using the `ref_model_mixup_alpha` - parameter. This synchronization originites from the - [TR-DPO](https://huggingface.co/papers/2404.09656) paper. - ref_model_mixup_alpha (`float`, *optional*, defaults to `0.9`): - α parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, - which controls the mix between the current policy and the previous reference - policy during updates. 
The reference policy is updated according to the equation: - `π_ref = α * π_θ + (1 - α) * π_ref_prev`To use this parameter, - you must set `sync_ref_model=True`. - ref_model_sync_steps (`int`, *optional*, defaults to `64`): - τ parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, - which determines how frequently the current policy is synchronized with - the reference policy. To use this parameter, you must set `sync_ref_model=True`. - rpo_alpha (`float`, *optional*, defaults to `None`): - α parameter from the [RPO](https://huggingface.co/papers/2404.19733) paper (v3), - which controls the weighting of the NLL term in the loss. If `None`, no weighting - is applied and the loss is the same as the DPO loss. The paper recommends - `rpo_alpha=1.0`. - """ - - beta: float = 0.1 - label_smoothing: float = 0.0 - loss_type: Literal[ - "sigmoid", - "hinge", - "ipo", - "exo_pair", - "nca_pair", - "robust", - "bco_pair", - "sppo_hard", - "aot", - "aot_pair", - "apo_zero", - "apo_down", - ] = "sigmoid" - label_pad_token_id: int = -100 - padding_value: Optional[int] = None - truncation_mode: str = "keep_end" - max_length: Optional[int] = None - max_prompt_length: Optional[int] = None - max_target_length: Optional[int] = None # deprecated in favor of max_completion_length - max_completion_length: Optional[int] = None - is_encoder_decoder: Optional[bool] = None - disable_dropout: bool = True - generate_during_eval: bool = False - precompute_ref_log_probs: bool = False - dataset_num_proc: Optional[int] = None - model_init_kwargs: Optional[Dict[str, Any]] = None - ref_model_init_kwargs: Optional[Dict[str, Any]] = None - model_adapter_name: Optional[str] = None - ref_adapter_name: Optional[str] = None - reference_free: bool = False - force_use_ref_model: bool = False - f_divergence_type: FDivergenceType = FDivergenceType.REVERSE_KL - f_alpha_divergence_coef: float = 1.0 - sync_ref_model: bool = False - ref_model_mixup_alpha: float = 0.9 - ref_model_sync_steps: int = 64 - rpo_alpha: Optional[float] = None - - def __post_init__(self): - if self.max_target_length is not None: - warnings.warn( - "The `max_target_length` argument is deprecated in favor of " - "`max_completion_length` and will be removed in a future version.", - FutureWarning, - ) - if self.max_completion_length is None: - self.max_completion_length = self.max_target_length - - return super().__post_init__() diff --git a/mindnlp/trl/trainer/dpo_trainer.py b/mindnlp/trl/trainer/dpo_trainer.py deleted file mode 100644 index 80c879cb8..000000000 --- a/mindnlp/trl/trainer/dpo_trainer.py +++ /dev/null @@ -1,1824 +0,0 @@ -# DPO Authors: Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano -# Ermon, Christopher D. Manning, and Chelsea Finn 2023 -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
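Before this removal, the typical entry point for the module deleted below was to build a `DPOConfig` and hand it to `DPOTrainer` together with a policy model, a reference model, a tokenizer and a paired-preference dataset. The following usage sketch reflects that old API as it appears in the deleted sources; the checkpoint path, the tokenizer and `train_dataset` are placeholders, not values taken from this diff:

from mindnlp.trl.trainer import DPOConfig, DPOTrainer
from mindnlp.transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholders: any causal-LM checkpoint and a dataset exposing the
# 'prompt', 'chosen' and 'rejected' columns expected by the deleted _tokenize helper.
model = AutoModelForCausalLM.from_pretrained("path/to/policy-model")
ref_model = AutoModelForCausalLM.from_pretrained("path/to/policy-model")
tokenizer = AutoTokenizer.from_pretrained("path/to/policy-model")

config = DPOConfig(
    output_dir="dpo_output",      # regular TrainingArguments field
    beta=0.1,                     # strength of the penalty toward the reference model
    loss_type="sigmoid",          # default DPO objective
    max_length=512,
    max_prompt_length=128,
    remove_unused_columns=False,  # required by DPODataCollatorWithPadding
)

trainer = DPOTrainer(
    model=model,
    ref_model=ref_model,          # may be None when training PEFT adapters
    args=config,
    tokenizer=tokenizer,
    train_dataset=train_dataset,  # placeholder paired-preference dataset
)
trainer.train()

Note that the deleted `__post_init__` above also maps the deprecated `max_target_length` onto `max_completion_length` and emits a `FutureWarning`, so older configs keep working.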
-"""DPO Trainer main class.""" -import inspect -import random -import warnings -from collections import defaultdict -from contextlib import contextmanager, nullcontext -from copy import deepcopy -from functools import partial -from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, NewType - - -import numpy -from tqdm import tqdm - -import mindspore -from mindspore import amp, nn, ops -from mindspore import Tensor -from mindspore.dataset import GeneratorDataset -from huggingface_hub.utils._deprecation import _deprecate_arguments - -from ...transformers import ( - AutoModelForCausalLM, - PreTrainedModel, - PreTrainedTokenizerBase, -) - -from ...engine import ( - Trainer, -) - -from ...transformers.models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES -from ...engine.callbacks import TrainerCallback -from ...engine.utils import EvalLoopOutput -from ..models import PreTrainedModelWrapper, create_reference_model -from .callbacks import SyncRefModelCallback -from .dpo_config import DPOConfig, FDivergenceConstants, FDivergenceType -from .utils import ( - DPODataCollatorWithPadding, - RunningMoments, - add_bos_token_if_needed, - add_eos_token_if_needed, - cap_exp, - disable_dropout_in_model, - pad_to_length, - peft_module_casting_to_bf16, -) - -from ...peft import PeftModel, get_peft_model - -InputDataClass = NewType("InputDataClass", Any) -DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, Any]]) - - -def _tokenize( - prompt, - chosen, - rejected, - tokenizer: PreTrainedTokenizerBase, - args: DPOConfig, - processor: Optional[Callable] = None, - model: Optional[PreTrainedModel] = None, -) -> Dict[str, List]: - """ - Tokenizes and processes a batch of input features using the provided tokenizer and processor. - """ - batch = defaultdict(list) - # as a patch when str is ndarray. - prompt = [prompt.tolist()] - chosen = [chosen.tolist()] - rejected = [rejected.tolist()] - if model is None: - images = [None] * len(prompt) - - prompt_tokens = _process_prompt(prompt, processor, tokenizer, images) - chosen_tokens = _process_answer(prompt, chosen, processor, tokenizer, images) - rejected_tokens = _process_answer(prompt, rejected, processor, tokenizer, images) - - prompt_len_input_ids = _adjust_prompt_length(prompt_tokens, chosen_tokens, rejected_tokens) - prompt_tokens, chosen_tokens, rejected_tokens = _add_special_tokens( - tokenizer, prompt_len_input_ids, prompt_tokens, chosen_tokens, rejected_tokens - ) - - _truncate_tokens(chosen_tokens, rejected_tokens, prompt_tokens, args) - - _build_sequence_tokens(batch, chosen_tokens, args, "chosen") - _build_sequence_tokens(batch, rejected_tokens, args, "rejected") - - _append_prompt_tokens_to_batch(batch, prompt_tokens) - - else: - _tokenize_encoder_decoder( - batch, tokenizer, prompt, chosen, rejected, args, model - ) - - features = dict(batch) - - return features - -def _process_prompt( - prompts: List[str], - processor: Optional[Callable], - tokenizer: PreTrainedTokenizerBase, - images: List[Optional[Any]] -) -> List[Dict[str, List[int]]]: - """ - Processes a list of prompts by tokenizing them, - optionally using a processor for additional processing. 
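The removed `_tokenize` pipeline splits every example into prompt, chosen and rejected token sequences. Its central trick, implemented by `_build_tokenized_answer` further down in this file, is to tokenize `prompt + answer` once and slice the answer ids off the end, backing off by one position when the tokenizer merges the last prompt token with the first answer token. A minimal sketch of that idea, assuming a Hugging Face style tokenizer (function and key names are illustrative):

def split_prompt_and_answer(tokenizer, prompt: str, answer: str) -> dict:
    """Tokenize prompt+answer jointly, then recover the answer ids by slicing."""
    full = tokenizer(prompt + answer, add_special_tokens=False)
    prompt_only = tokenizer(prompt, add_special_tokens=False)

    start = len(prompt_only["input_ids"])
    # If the joint ids no longer start with the prompt-only ids, the boundary
    # token was merged; treat the merged token as part of the answer.
    if prompt_only["input_ids"] != full["input_ids"][:start]:
        start -= 1

    return {
        "prompt_input_ids": full["input_ids"][:start],
        "prompt_attention_mask": full["attention_mask"][:start],
        "input_ids": full["input_ids"][start:],
        "attention_mask": full["attention_mask"][start:],
    }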
- """ - if processor: - processor_kwargs = ( - {"add_special_tokens": False} - if "add_special_tokens" in inspect.signature(processor).parameters - else {} - ) - prompt_tokens = [] - for prompt, image in zip(prompts, images): - tokens = processor(prompt, images=image, **processor_kwargs) - tokens = {k: v[0] for k, v in tokens.items()} - if not isinstance(tokens["input_ids"], list): - tokens["input_ids"] = tokens["input_ids"].tolist() - tokens["attention_mask"] = tokens["attention_mask"].tolist() - prompt_tokens.append(tokens) - else: - prompt_tokens = [tokenizer(prompt, add_special_tokens=False) - for prompt in prompts] - return [{f"prompt_{k}": v for k, v in tokens.items()} - for tokens in prompt_tokens] - - -def _process_answer( - prompts: List[str], - answers: List[str], - processor: Optional[Callable], - tokenizer: PreTrainedTokenizerBase, - images: List[Optional[Any]], -) -> List[Dict[str, Any]]: - return [ - _build_tokenized_answer( - prompt, answer, image, processor=processor, tokenizer=tokenizer - ) - for prompt, answer, image in zip(prompts, answers, images) - ] - - -def _adjust_prompt_length( - prompt_tokens: List[Dict[str, List[int]]], - chosen_tokens: List[Dict[str, List[int]]], - rejected_tokens: List[Dict[str, List[int]]], -) -> List[int]: - prompt_len_input_ids = [] - for p_tokens, c_tokens, r_tokens in zip(prompt_tokens, chosen_tokens, rejected_tokens): - c_len = len(c_tokens["prompt_input_ids"]) - r_len = len(r_tokens["prompt_input_ids"]) - min_len = min(c_len, r_len) - - for k, v in p_tokens.items(): - p_tokens[k] = v[:min_len] - - num_diff_tokens = sum(a != b for a, b in zip(c_tokens["prompt_input_ids"], r_tokens["prompt_input_ids"])) - num_diff_len = abs(c_len - r_len) - if num_diff_tokens > 1 or num_diff_len > 1: - raise ValueError( - "Chosen and rejected prompt_input_ids might only differ on the" - "last token due to tokenizer merge ops." - ) - prompt_len_input_ids.append(min_len) - return prompt_len_input_ids - - -def _add_special_tokens( - tokenizer: PreTrainedTokenizerBase, - prompt_len_input_ids: List[int], - prompt_tokens: List[Dict[str, List[int]]], - chosen_tokens: List[Dict[str, List[int]]], - rejected_tokens: List[Dict[str, List[int]]], -) -> Tuple[List[Dict[str, List[int]]], List[Dict[str, List[int]]], List[Dict[str, List[int]]]]: - for i in range(len(prompt_tokens)): - prompt_tokens[i], chosen_tokens[i], rejected_tokens[i] = add_bos_token_if_needed( - tokenizer.bos_token_id, - prompt_len_input_ids[i], - prompt_tokens[i], - len(chosen_tokens[i]["prompt_input_ids"]), - chosen_tokens[i], - len(rejected_tokens[i]["prompt_input_ids"]), - rejected_tokens[i], - ) - - chosen_tokens[i], rejected_tokens[i] = add_eos_token_if_needed( - tokenizer.eos_token_id, chosen_tokens[i], rejected_tokens[i] - ) - return prompt_tokens, chosen_tokens, rejected_tokens - - -def _truncate_tokens( - chosen_tokens: List[Dict[str, List[int]]], - rejected_tokens: List[Dict[str, List[int]]], - prompt_tokens: List[Dict[str, List[int]]], - args: DPOConfig, -) -> None: - """ - Truncates the tokens in chosen, rejected, and prompt sequences to ensure they - fit within the maximum length constraints. 
- """ - if args.truncation_mode not in ["keep_start", "keep_end"]: - raise ValueError(f"Invalid truncation mode: {args.truncation_mode}") - - for c_tokens, r_tokens, p_tokens in zip(chosen_tokens, rejected_tokens, prompt_tokens): - longer_response_length = max(len(c_tokens["input_ids"]), len(r_tokens["input_ids"])) - - # if combined sequence is too long, truncate the prompt - for answer_tokens in [c_tokens, r_tokens, p_tokens]: - if len(answer_tokens["prompt_input_ids"]) + longer_response_length > args.max_length: - if args.truncation_mode == "keep_start": - for k in ["prompt_input_ids", "prompt_attention_mask"]: - answer_tokens[k] = answer_tokens[k][: args.max_prompt_length] - elif args.truncation_mode == "keep_end": - for k in ["prompt_input_ids", "prompt_attention_mask"]: - answer_tokens[k] = answer_tokens[k][-args.max_prompt_length :] - - # if that's still too long, truncate the response from the end - for answer_tokens in [c_tokens, r_tokens]: - if len(answer_tokens["prompt_input_ids"]) + longer_response_length > args.max_length: - for k in ["input_ids", "attention_mask"]: - answer_tokens[k] = answer_tokens[k][: args.max_length - args.max_prompt_length] - - -def _build_sequence_tokens( - batch: Dict[str, List[int]], - tokens: List[Dict[str, List[int]]], - args: DPOConfig, prefix: str -) -> None: - for token in tokens: - sequence_tokens = {f"{prefix}_{k}": token[f"prompt_{k}"] + token[k] - for k in ["input_ids", "attention_mask"]} - sequence_tokens[f"{prefix}_labels"] = sequence_tokens[f"{prefix}_input_ids"][:] - sequence_tokens[f"{prefix}_labels"][: len(token["prompt_input_ids"])] = \ - [args.label_pad_token_id] * len(token["prompt_input_ids"]) - for k, v in sequence_tokens.items(): - batch[k].append(v) - - -def _append_prompt_tokens_to_batch( - batch: Dict[str, List[int]], - prompt_tokens: List[Dict[str, List[int]]] - ) -> None: - for p_tokens in prompt_tokens: - for k, v in p_tokens.items(): - batch[k].append(v) - - -def _tokenize_encoder_decoder( - batch: Dict[str, List[int]], - tokenizer: PreTrainedTokenizerBase, - prompt: List[str], - chosen: List[str], - rejected: List[str], - args: DPOConfig, - model: Optional[PreTrainedModel], -) -> None: - chosen_tokens = tokenizer( - chosen, - truncation=True, - max_length=args.max_completion_length, - add_special_tokens=True - ) - rejected_tokens = tokenizer( - rejected, - truncation=True, - max_length=args.max_completion_length, - add_special_tokens=True - ) - prompt_tokens = tokenizer( - prompt, - truncation=True, - max_length=args.max_prompt_length, - add_special_tokens=True - ) - - batch["chosen_labels"] = chosen_tokens["input_ids"] - batch["rejected_labels"] = rejected_tokens["input_ids"] - batch["prompt_input_ids"] = prompt_tokens["input_ids"] - batch["prompt_attention_mask"] = prompt_tokens["attention_mask"] - - if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"): - # Ensure the sequences are of the same length - max_length = max(len(seq) for seq in batch["chosen_labels"] + batch["rejected_labels"]) - batch["chosen_labels"] = [ - seq + [tokenizer.pad_token_id] * (max_length - len(seq)) - for seq in batch["chosen_labels"] - ] - batch["rejected_labels"] = [ - seq + [tokenizer.pad_token_id] * (max_length - len(seq)) - for seq in batch["rejected_labels"] - ] - - batch["rejected_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels( - labels=mindspore.tensor(batch["rejected_labels"]) - ) - batch["chosen_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels( - 
labels=mindspore.tensor(batch["chosen_labels"]) - ) - - -def _build_tokenized_answer( - prompt: str, - answer: str, - images: Optional[List[Any]] = None, - processor: Optional[Callable] = None, - tokenizer: Optional[PreTrainedTokenizerBase] = None, -) -> Dict[str, Any]: - """ - Build tokenized response, handling vision models and different tokenizers. - """ - - def tokenize(text, images=None): - if processor: - processor_kwargs = ( - {"add_special_tokens": False} - if "add_special_tokens" in inspect.signature(processor).parameters - else {} - ) - tokenized = processor(text, images=images, **processor_kwargs) - tokenized = {k: v[0] for k, v in tokenized.items()} - if not isinstance(tokenized["input_ids"], list): - tokenized["input_ids"] = tokenized["input_ids"].tolist() - tokenized["attention_mask"] = tokenized["attention_mask"].tolist() - else: - tokenized = tokenizer(text, add_special_tokens=False) - return tokenized - - full_tokenized = tokenize(prompt + answer, images) - prompt_tokenized = tokenize(prompt, images) - - prompt_input_ids = prompt_tokenized["input_ids"] - answer_input_ids = full_tokenized["input_ids"][len(prompt_input_ids) :] - answer_attention_mask = full_tokenized["attention_mask"][len(prompt_input_ids) :] - - if len(full_tokenized["input_ids"]) != len(prompt_input_ids + answer_input_ids): - raise ValueError("Prompt input ids and answer input ids should have the same length.") - - # On some tokenizers, like Llama-2 tokenizer, there are occasions where tokens - # can be merged together when tokenizing prompt+answer. This could result - # on the last token from the prompt being different when tokenized on its own - # vs when done as prompt+answer. - response_token_ids_start_idx = len(prompt_input_ids) - - # If tokenized prompt is different than both prompt+answer, then it means the - # last token has changed due to merging. - if prompt_input_ids != full_tokenized["input_ids"][:response_token_ids_start_idx]: - response_token_ids_start_idx -= 1 - - prompt_input_ids = full_tokenized["input_ids"][:response_token_ids_start_idx] - prompt_attention_mask = full_tokenized["attention_mask"][:response_token_ids_start_idx] - - if len(prompt_input_ids) != len(prompt_attention_mask): - raise ValueError("Prompt input ids and attention mask should have the same length.") - - return_dict = { - "prompt_input_ids": prompt_input_ids, - "prompt_attention_mask": prompt_attention_mask, - "input_ids": answer_input_ids, - "attention_mask": answer_attention_mask, - } - if "pixel_values" in full_tokenized: - return_dict["prompt_pixel_values"] = full_tokenized["pixel_values"] - if "pixel_attention_mask" in full_tokenized: - return_dict["prompt_pixel_attention_mask"] = full_tokenized["pixel_attention_mask"] - - return return_dict - - -class DPOTrainer(Trainer): - r""" - Initialize DPOTrainer. - - Args: - model (`transformers.PreTrainedModel`): - The model to train, preferably an `AutoModelForSequenceClassification`. - ref_model (`PreTrainedModelWrapper`): - Hugging Face transformer model with a casual language modelling head. - Used for implicit reward computation and loss. If no - reference model is provided, the trainer will create a reference model - with the same architecture as the model to be optimized. - args (`DPOConfig`): - The DPO config arguments to use for training. - data_collator (`transformers.DataCollator`): - The data collator to use for training. 
If None is specified, - the default data collator (`DPODataCollatorWithPadding`) will be used - which will pad the sequences to the maximum length of the sequences - in the batch, given a dataset of paired sequences. - train_dataset (`datasets.Dataset`): - The dataset to use for training. - eval_dataset (`datasets.Dataset`): - The dataset to use for evaluation. - tokenizer (`transformers.PreTrainedTokenizerBase`): - The tokenizer to use for training. This argument is required if you - want to use the default data collator. - model_init (`Callable[[], transformers.PreTrainedModel]`): - The model initializer to use for training. If None is specified, the - default model initializer will be used. - callbacks (`List[transformers.TrainerCallback]`): - The callbacks to use for training. - optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): - The optimizer and scheduler to use for training. - preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor],torch.Tensor]`): - The function to use to preprocess the logits before computing the metrics. - peft_config (`Dict`, defaults to `None`): - The PEFT configuration to use for training. If you pass a PEFT configuration, - the model will be wrapped in a PEFT model. - compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*): - The function to use to compute the metrics. Must take a `EvalPrediction` and return - a dictionary string to metric values. - """ - - _tag_names = ["trl", "dpo"] - - @_deprecate_arguments( - version="1.0.0", - deprecated_args=[ - "beta", - "label_smoothing", - "loss_type", - "label_pad_token_id", - "padding_value", - "truncation_mode", - "max_length", - "max_prompt_length", - "max_target_length", - "is_encoder_decoder", - "disable_dropout", - "generate_during_eval", - "precompute_ref_log_probs", - "dataset_num_proc", - "model_init_kwargs", - "ref_model_init_kwargs", - "model_adapter_name", - "ref_adapter_name", - "reference_free", - "force_use_ref_model", - ], - custom_message="Deprecated positional argument(s) used in DPOTrainer," - "please use the DPOConfig to set these arguments instead.", - ) - def __init__( - self, - model: Optional[Union[PreTrainedModel, nn.Cell, str]] = None, - ref_model: Optional[Union[PreTrainedModel, nn.Cell, str]] = None, - beta: float = 0.1, - label_smoothing: float = 0, - loss_type: Optional[str] = None, - args: Optional[DPOConfig] = None, - data_collator: Optional[DataCollator] = None, # type: ignore - label_pad_token_id: int = -100, - padding_value: Optional[int] = None, - truncation_mode: str = "keep_end", - train_dataset: Optional[GeneratorDataset] = None, - eval_dataset: Optional[Union[GeneratorDataset, Dict[str, GeneratorDataset]]] = None, - tokenizer: Optional[PreTrainedTokenizerBase] = None, - model_init: Optional[Callable[[], PreTrainedModel]] = None, - callbacks: Optional[List[TrainerCallback]] = None, - optimizers: Tuple[mindspore.nn.Optimizer, Any] = (None, None), - preprocess_logits_for_metrics: \ - Optional[Callable[[mindspore.Tensor, mindspore.Tensor], mindspore.Tensor]] = None, - max_length: Optional[int] = None, - max_prompt_length: Optional[int] = None, - max_target_length: Optional[int] = None, - peft_config: Optional[Dict] = None, - is_encoder_decoder: Optional[bool] = None, - disable_dropout: bool = True, - generate_during_eval: bool = False, - compute_metrics: Optional[Callable[[EvalLoopOutput], Dict]] = None, - precompute_ref_log_probs: bool = False, - dataset_num_proc: Optional[int] = None, - model_init_kwargs: Optional[Dict] = 
None, - ref_model_init_kwargs: Optional[Dict] = None, - model_adapter_name: Optional[str] = None, - ref_adapter_name: Optional[str] = None, - reference_free: bool = False, - force_use_ref_model: bool = False, - ): - if model_init_kwargs is not None: - warnings.warn( - "You passed `model_init_kwargs` to the DPOTrainer, the value you passed" - "will override the one in the `DPOConfig`." - ) - args.model_init_kwargs = model_init_kwargs - - if args.model_init_kwargs is None: - model_init_kwargs = {} - elif not isinstance(model, str): - raise ValueError( - "You passed model_init_kwargs to the DPOTrainer/DPOConfig, but your model" - "is already instantiated." - ) - else: - model_init_kwargs = args.model_init_kwargs - ms_dtype = model_init_kwargs.get("ms_dtype") - if ms_dtype is not None: - # Convert to `torch.dtype` if an str is passed - if isinstance(ms_dtype, str) and ms_dtype != "auto": - ms_dtype = getattr(mindspore, ms_dtype) - model_init_kwargs["ms_dtype"] = ms_dtype - - if ref_model_init_kwargs is not None: - warnings.warn( - "You passed `ref_model_init_kwargs` to the DPOTrainer, the value you passed" - "will override the one in the `DPOConfig`." - ) - args.ref_model_init_kwargs = ref_model_init_kwargs - - if args.ref_model_init_kwargs is None: - ref_model_init_kwargs = {} - elif not isinstance(ref_model, str): - raise ValueError( - "You passed ref_model_init_kwargs to the DPOTrainer/DPOConfig, but your" - "ref_model is already instantiated." - ) - else: - ref_model_init_kwargs = args.ref_model_init_kwargs - ms_dtype = ref_model_init_kwargs.get("ms_dtype") - if ms_dtype is not None: - # Convert to `torch.dtype` if an str is passed - if isinstance(ms_dtype, str) and ms_dtype != "auto": - ms_dtype = getattr(mindspore, ms_dtype) - ref_model_init_kwargs["ms_dtype"] = ms_dtype - - if isinstance(model, str): - warnings.warn( - "You passed a model_id to the DPOTrainer. This will automatically create an " - "`AutoModelForCausalLM` or a `PeftModel` (if you passed a `peft_config`) for you." - ) - model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs) - - if isinstance(ref_model, str): - warnings.warn( - "You passed a ref model_id to the DPOTrainer. This will automatically create an " - "`AutoModelForCausalLM`" - ) - ref_model = AutoModelForCausalLM.from_pretrained(ref_model, **ref_model_init_kwargs) - - # Initialize this variable to False. - # This helps tracking the case when `peft_module_casting_to_bf16` - # has been called in order to properly call autocast if needed. - self._peft_has_been_casted_to_bf16 = False - - if force_use_ref_model: - warnings.warn( - "You passed `force_use_ref_model` to the DPOTrainer, the value you passed will" - "override the one in the `DPOConfig`." - ) - args.force_use_ref_model = force_use_ref_model - - - if peft_config is not None: - # if model is a peft model and we have a peft_config, we merge and unload it first - if isinstance(model, PeftModel): - model = model.merge_and_unload() - - if ref_model is not None and not args.force_use_ref_model: - raise ValueError( - "You passed both a ref_model and a peft_config. For" - "training PEFT adapters with DPO there is no need to" - "pass a reference model.Please pass `ref_model=None`" - "in case you want to train PEFT adapters, or pass a" - "ref_model with `force_use_ref_model=True`in DPOTrainer's" - "init.if you want to use a different ref_model." 
- ) - - if getattr(model, "is_loaded_in_8bit", False) or \ - getattr(model, "is_loaded_in_4bit", False): - pass - elif getattr(args, "gradient_checkpointing", False): - # For backward compatibility with older versions of transformers - if hasattr(model, "enable_input_require_grads"): - model.enable_input_require_grads() - else: - - def make_inputs_require_grad(module, input, output): - output.requires_grad_(True) - - model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) - - # get peft model with the given config - model = get_peft_model(model, peft_config) - if args.bf16 and getattr(model, "is_loaded_in_4bit", False): - peft_module_casting_to_bf16(model) - # If args.bf16 we need to explicitly call `generate` - # with torch amp autocast context manager - self._peft_has_been_casted_to_bf16 = True - - # For models that use gradient_checkpointing, we need to attach a hook that enables input - # to explicitly have `requires_grad=True`, otherwise training will either silently - # fail or completely fail. - elif getattr(args, "gradient_checkpointing", False): - # For backward compatibility with older versions of transformers - if hasattr(model, "enable_input_require_grads"): - model.enable_input_require_grads() - else: - - def make_inputs_require_grad(module, input, output): - output.requires_grad_(True) - - model.get_input_embeddings().register_forward_hook( - make_inputs_require_grad) - - if generate_during_eval: - warnings.warn( - "You passed `generate_during_eval` to the DPOTrainer, the value you" - "passed will override the one in the `DPOConfig`." - ) - args.generate_during_eval = generate_during_eval - - if is_encoder_decoder is not None: - warnings.warn( - "You passed `is_encoder_decoder` to the DPOTrainer, the value you passed" - "will override the one in the `DPOConfig`." - ) - args.is_encoder_decoder = is_encoder_decoder - if model is not None: - self.is_encoder_decoder = model.config.is_encoder_decoder - elif args.is_encoder_decoder is None: - raise ValueError( - "When no model is provided, you need to pass the parameter" - "is_encoder_decoder to the DPOTrainer/DPOConfig." - ) - else: - self.is_encoder_decoder = args.is_encoder_decoder - - if model is not None: - self.is_vision_model = model.config.model_type in \ - MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES.keys() - else: - warnings.warn( - "No model provided, cannot determine if it is a vision model." - "Setting is_vision_model to False." - ) - self.is_vision_model = False - - if self.is_vision_model: - self.processor = tokenizer - self.tokenizer = tokenizer.tokenizer # tokenizer is actually a processor at this point - else: - self.tokenizer = tokenizer - - self.is_peft_model = isinstance(model, PeftModel) - if model_adapter_name is not None: - warnings.warn( - "You passed `model_adapter_name` to the DPOTrainer, the value you" - "passed will override the one in the `DPOConfig`." - ) - args.model_adapter_name = model_adapter_name - self.model_adapter_name = args.model_adapter_name - - if ref_adapter_name is not None: - warnings.warn( - "You passed `ref_adapter_name` to the DPOTrainer, the value you" - "passed will override the one in the `DPOConfig`." - ) - args.ref_adapter_name = ref_adapter_name - self.ref_adapter_name = args.ref_adapter_name - - if reference_free: - warnings.warn( - "You passed `reference_free` to the DPOTrainer, the value you passed" - "will override the one in the `DPOConfig`." 
- ) - args.reference_free = reference_free - self.reference_free = args.reference_free - - if precompute_ref_log_probs: - warnings.warn( - "You passed `precompute_ref_log_probs` to the DPOTrainer, the value" - "you passed will override the one in the `DPOConfig`." - ) - args.precompute_ref_log_probs = precompute_ref_log_probs - - if ref_model: - self.ref_model = ref_model - elif self.is_peft_model or args.precompute_ref_log_probs: - # The `model` with adapters turned off will be used as the reference model - self.ref_model = None - else: - self.ref_model = create_reference_model(model) - - if tokenizer is None: - raise ValueError("tokenizer must be specified to tokenize a DPO dataset.") - - if max_length is not None: - warnings.warn( - "You passed `max_length` to the DPOTrainer, the value you" - "passed will override the one in the `DPOConfig`." - ) - args.max_length = max_length - if args.max_length is None: - warnings.warn( - "`max_length` is not set in the DPOConfig's init" - " it will default to `512` by default, but you should do" - "it yourself in the future.", - UserWarning, - ) - args.max_length = 512 - - if max_prompt_length is not None: - warnings.warn( - "You passed `max_prompt_length` to the DPOTrainer, the" - "value you passed will override the one in the `DPOConfig`." - ) - args.max_prompt_length = max_prompt_length - if args.max_prompt_length is None: - warnings.warn( - "`max_prompt_length` is not set in the DPOConfig's init" - " it will default to `128` by default, but you should do" - "it yourself in the future.", - UserWarning, - ) - args.max_prompt_length = 128 - - if max_target_length is not None: - warnings.warn( - "You passed `max_target_length` to the DPOTrainer, the" - "value you passed will override the one in the `DPOConfig`." - ) - args.max_completion_length = max_target_length - if args.max_completion_length is None and self.is_encoder_decoder: - warnings.warn( - "When using an encoder decoder architecture, you should set" - "`max_completion_length` in the DPOConfig's init" - " it will default to `128` by default, but you should do" - "it yourself in the future.", - UserWarning, - ) - args.max_completion_length = 128 - - if label_pad_token_id != -100: - warnings.warn( - "You passed `label_pad_token_id` to the DPOTrainer, the" - "value you passed will override the one in the `DPOConfig`." - ) - args.label_pad_token_id = label_pad_token_id - if data_collator is None: - data_collator = DPODataCollatorWithPadding( - pad_token_id=self.tokenizer.pad_token_id, - label_pad_token_id=args.label_pad_token_id, - is_encoder_decoder=self.is_encoder_decoder, - ) - - if args.remove_unused_columns: - args.remove_unused_columns = False - # warn users - warnings.warn( - "When using DPODataCollatorWithPadding, you should" - "set `remove_unused_columns=False` in your TrainingArguments" - " we have set it for you, but you should do it yourself" - "in the future.", - UserWarning, - ) - - self.use_dpo_data_collator = True - else: - self.use_dpo_data_collator = False - - if not disable_dropout: - warnings.warn( - "You passed `disable_dropout` to the DPOTrainer, the" - "value you passed will override the one in the `DPOConfig`." 
- ) - args.disable_dropout = disable_dropout - if args.disable_dropout: - disable_dropout_in_model(model) - if self.ref_model is not None: - disable_dropout_in_model(self.ref_model) - - self.max_length = args.max_length - self.generate_during_eval = args.generate_during_eval - self.label_pad_token_id = args.label_pad_token_id - if padding_value is not None: - warnings.warn( - "You passed `padding_value` to the DPOTrainer, the value" - "you passed will override the one in the `DPOConfig`." - ) - args.padding_value = padding_value - self.padding_value = args.padding_value if padding_value is not None \ - else self.tokenizer.pad_token_id - self.max_prompt_length = args.max_prompt_length - if truncation_mode != "keep_end": - warnings.warn( - "You passed `truncation_mode` to the DPOTrainer, the" - "value you passed will override the one in the `DPOConfig`." - ) - args.truncation_mode = truncation_mode - self.truncation_mode = args.truncation_mode - self.max_completion_length = args.max_completion_length - self.precompute_ref_log_probs = args.precompute_ref_log_probs - - # Since ref_logs are precomputed on the first call to get_train/eval_dataloader - # keep track of first called to avoid computation of future calls - self._precomputed_train_ref_log_probs = False - self._precomputed_eval_ref_log_probs = False - - if loss_type is not None: - warnings.warn( - "You passed `loss_type` to the DPOTrainer, the value you" - "passed will override the one in the `DPOConfig`." - ) - args.loss_type = loss_type - if label_smoothing != 0: - warnings.warn( - "You passed `label_smoothing` to the DPOTrainer, the" - "value you passed will override the one in the `DPOConfig`." - ) - args.label_smoothing = label_smoothing - if ( - args.loss_type in ["hinge", "ipo", "bco_pair", \ - "sppo_hard", "nca_pair", "apo_zero", "apo_down"] - and args.label_smoothing > 0 - ): - warnings.warn( - "You are using a loss type that does not support label" - "smoothing. Ignoring label_smoothing parameter." - ) - if args.loss_type == "kto_pair": - raise ValueError("Support for kto_pair has been removed in" - "DPOTrainer. Please use KTOTrainer.") - - if beta != 0.1: - warnings.warn( - "You passed `beta` to the DPOTrainer, the value you" - "passed will override the one in the `DPOConfig`." - ) - args.beta = beta - self.beta = args.beta - self.label_smoothing = args.label_smoothing - self.loss_type = args.loss_type - self.aux_loss_enabled = getattr(model.config, "output_router_logits", False) - - self._stored_metrics = defaultdict(lambda: defaultdict(list)) - - self.f_divergence_type = args.f_divergence_type - self.f_divergence_params = \ - {FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY: args.f_alpha_divergence_coef} - - if dataset_num_proc is not None: - warnings.warn( - "You passed `dataset_num_proc` to the DPOTrainer, the" - "value you passed will override the one in the `DPOConfig`." - ) - args.dataset_num_proc = dataset_num_proc - self.dataset_num_proc = args.dataset_num_proc - - # Compute that only on the main process for faster data processing. 
- # see: https://github.com/huggingface/trl/pull/1255 - # with PartialState().local_main_process_first(): - # tokenize the dataset, lower writer batch size to avoid OOM (frequent in vision models) - fn_kwargs = { - "tokenizer": self.tokenizer, - "args": args, - "processor": self.processor if self.is_vision_model else None, - "model": model if self.is_encoder_decoder else None, - } - fn_tokenize = partial(_tokenize, **fn_kwargs) - - train_dataset = train_dataset.map( - fn_tokenize, - input_columns=['prompt', 'chosen', 'rejected'], - output_columns='features', - num_parallel_workers=self.dataset_num_proc, - ) - batched_train_dataset = train_dataset.batch( - args.per_device_train_batch_size, - False, - 1, - per_batch_map=data_collator, - input_columns=['features'], - ) - if eval_dataset is not None: - eval_dataset = eval_dataset.map( - fn_tokenize, - input_columns=['prompt', 'chosen', 'rejected'], - output_columns='features', - num_parallel_workers=self.dataset_num_proc, - ) - batched_eval_dataset = eval_dataset.batch( - args.per_device_eval_batch_size, - False, - 1, - per_batch_map=data_collator, - input_columns=['features'], - ) - - super().__init__( - model=model, - args=args, - # map_fn=data_collator, - train_dataset=batched_train_dataset, - eval_dataset=batched_eval_dataset, - tokenizer=tokenizer, - model_init=model_init, - compute_metrics=compute_metrics, - callbacks=callbacks, - optimizers=optimizers, - preprocess_logits_for_metrics=preprocess_logits_for_metrics, - ) - - # Add tags for models that have been loaded with the correct transformers version - if hasattr(self.model, "add_model_tags"): - self.model.add_model_tags(self._tag_names) - - if self.ref_model is None: - if not (self.is_peft_model or self.precompute_ref_log_probs): - raise ValueError( - "No reference model and model is not a Peft model. Try" - "setting `precompute_ref_log_probs=True`" - ) - if args.sync_ref_model: - raise ValueError( - "You currently cannot use `ref_model=None` with" - "TR-DPO method. Please provide `ref_model`." - ) - - if args.sync_ref_model: - if precompute_ref_log_probs: - raise ValueError( - "You cannot use `precompute_ref_log_probs=True` with" - "TR-DPO method. Please set `precompute_ref_log_probs=False`." - ) - - self.add_callback(SyncRefModelCallback( - ref_model=self.ref_model, - accelerator=self.accelerator)) - if self.loss_type == "bco_pair": - self.running = RunningMoments(self.accelerator) - - def _prepare_deepspeed(self, model: PreTrainedModelWrapper): - deepspeed_plugin = self.accelerator.state.deepspeed_plugin - config_kwargs = deepcopy(deepspeed_plugin.deepspeed_config) - - if model is not None: - if hasattr(model, "config"): - hidden_size = ( - max(model.config.hidden_sizes) - if getattr(model.config, "hidden_sizes", None) - else getattr(model.config, "hidden_size", None) - ) - if hidden_size is not None and config_kwargs["zero_optimization"]["stage"] == 3: - # Note that `stage3_prefetch_bucket_size` can produce DeepSpeed - # messages like: `Invalidate trace cache @ step 0: expected - # module 1, but got module 0`This is expected and is not an - # error, see: https://github.com/microsoft/DeepSpeed/discussions/4081 - config_kwargs.update( - { - "zero_optimization.reduce_bucket_size": - hidden_size * hidden_size, - "zero_optimization.stage3_param_persistence_threshold": - 10 * hidden_size, - "zero_optimization.stage3_prefetch_bucket_size": - 0.9 * hidden_size * hidden_size, - } - ) - - # If ZeRO-3 is used, we shard both the active and reference model. 
- # Otherwise, we assume the reference model fits in memory and is - # initialized on each device with ZeRO disabled (stage 0) - if config_kwargs["zero_optimization"]["stage"] != 3: - config_kwargs["zero_optimization"]["stage"] = 0 - # model, *_ = deepspeed.initialize(model=model, config=config_kwargs) - model.eval() - return model - - def get_train_dataloader(self) -> GeneratorDataset: - """ - Returns the training [`~torch.utils.data.GeneratorDataset`]. - - Subclass of transformers.src.transformers.trainer.get_train_dataloader - to precompute `ref_log_probs`. - """ - - if self.precompute_ref_log_probs and not self._precomputed_train_ref_log_probs: - dataloader_params = { - "batch_size": self.args.per_device_train_batch_size, - "collate_fn": self.data_collator, - "num_workers": self.args.dataloader_num_workers, - "pin_memory": self.args.dataloader_pin_memory, - "shuffle": False, - } - - # prepare dataloader - data_loader = self.accelerator.prepare( - GeneratorDataset(self.train_dataset, **dataloader_params)) - - reference_chosen_logps = [] - reference_rejected_logps = [] - for padded_batch in tqdm( - iterable=data_loader, - desc="Train dataset reference log probs" - ): - reference_chosen_logp, reference_rejected_logp = \ - self.compute_reference_log_probs(padded_batch) - # reference_chosen_logp, reference_rejected_logp = self.accelerator.gather_for_metrics( - # (reference_chosen_logp, reference_rejected_logp) - # ) - reference_chosen_logps.append(reference_chosen_logp.cpu()) - reference_rejected_logps.append(reference_rejected_logp.cpu()) - - # Unnecessary cache clearing to avoid OOM - # torch.cuda.empty_cache() - # self.accelerator.free_memory() - - all_reference_chosen_logps = mindspore.ops.cat( - reference_chosen_logps).float().numpy() - all_reference_rejected_logps = mindspore.ops.cat( - reference_rejected_logps).float().numpy() - - self.train_dataset = self.train_dataset.add_column( - name="reference_chosen_logps", column=all_reference_chosen_logps - ) - self.train_dataset = self.train_dataset.add_column( - name="reference_rejected_logps", column=all_reference_rejected_logps - ) - - self._precomputed_train_ref_log_probs = True - - return super().get_train_dataloader() - - def get_eval_dataloader( - self, - eval_dataset: Optional[GeneratorDataset] = None - ) -> GeneratorDataset: - """ - Returns the evaluation [`~torch.utils.data.GeneratorDataset`]. - - Subclass of transformers.src.transformers.trainer.get_eval_dataloader to - precompute `ref_log_probs`. - - Args: - eval_dataset (`torch.utils.data.Dataset`, *optional*): - If provided, will override `self.eval_dataset`. If it is a - [`~datasets.Dataset`], columns not accepted by the `model.forward()' - method are automatically removed. It must implement `__len__`. 
- """ - if eval_dataset is None and self.eval_dataset is None: - raise ValueError("Trainer: evaluation requires an eval_dataset.") - eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset - - if self.precompute_ref_log_probs and not self._precomputed_eval_ref_log_probs: - dataloader_params = { - "batch_size": self.args.per_device_eval_batch_size, - "collate_fn": self.data_collator, - "num_workers": self.args.dataloader_num_workers, - "pin_memory": self.args.dataloader_pin_memory, - "shuffle": False, - } - - # prepare dataloader - data_loader = self.accelerator.prepare( - GeneratorDataset(eval_dataset, **dataloader_params)) - - reference_chosen_logps = [] - reference_rejected_logps = [] - for padded_batch in tqdm( - iterable=data_loader, - desc="Eval dataset reference log probs" - ): - reference_chosen_logp, reference_rejected_logp = \ - self.compute_reference_log_probs(padded_batch) - reference_chosen_logp, reference_rejected_logp = \ - self.accelerator.gather_for_metrics( - (reference_chosen_logp, reference_rejected_logp)) - reference_chosen_logps.append(reference_chosen_logp.cpu()) - reference_rejected_logps.append(reference_rejected_logp.cpu()) - - all_reference_chosen_logps = mindspore.ops.cat( - reference_chosen_logps).float().numpy() - all_reference_rejected_logps = mindspore.ops.cat( - reference_rejected_logps).float().numpy() - - eval_dataset = eval_dataset.add_column( - name="reference_chosen_logps", column=all_reference_chosen_logps - ) - eval_dataset = eval_dataset.add_column( - name="reference_rejected_logps", column=all_reference_rejected_logps - ) - - # Save calculated reference_chosen_logps and reference_rejected_logps - # to the eval_dataset for subsequent runs - if self.eval_dataset is not None: - self.eval_dataset = eval_dataset - self._precomputed_eval_ref_log_probs = True - - return super().get_eval_dataloader(eval_dataset=eval_dataset) - - @contextmanager - def null_ref_context(self): - - def model_disable_adapter(model): - for cell in model.modules(): - cell.enable_adapters() - - """Context manager for handling null reference model (that is, - peft adapter manipulation).""" - # with model_disable_adapter(self.model) if self.is_peft_model - # and not self.ref_adapter_name else nullcontext(): - with nullcontext(): - if self.ref_adapter_name: - self.model.set_adapter(self.ref_adapter_name) - yield - if self.ref_adapter_name: - self.model.set_adapter(self.model_adapter_name or "default") - - - def compute_reference_log_probs(self, padded_batch: Dict) -> Dict: - """Computes log probabilities of the reference model for - a single padded batch of a DPO specific dataset.""" - compte_ref_context_manager = amp.autocast("cuda") \ - if self._peft_has_been_casted_to_bf16 else nullcontext() - - # compute reference logps - with mindspore._no_grad(), compte_ref_context_manager: - if self.ref_model is None: - with self.null_ref_context(): - reference_chosen_logps, reference_rejected_logps = self.concatenated_forward( - self.model, padded_batch - )[:2] - else: - reference_chosen_logps, reference_rejected_logps = self.concatenated_forward( - self.ref_model, padded_batch - )[:2] - - return reference_chosen_logps, reference_rejected_logps - - @staticmethod - def concatenated_inputs( - batch: Dict[str, Union[List, mindspore.Tensor]], - is_encoder_decoder: bool = False, - is_vision_model: bool = False, - label_pad_token_id: int = -100, - padding_value: int = 0, - device = None, - ) -> Dict[str, mindspore.Tensor]: - """Concatenate the chosen and rejected inputs into a 
single tensor. - - Args: - batch: A batch of data. Must contain the keys 'chosen_input_ids' - and 'rejected_input_ids', which are tensors of shape (batch_size, sequence_length). - is_encoder_decoder: Whether the model is an encoder-decoder model. - label_pad_token_id: The label pad token id. - padding_value: The padding value to use for the concatenated inputs_ids. - device: The device for the concatenated inputs. - - Returns: - A dictionary containing the concatenated inputs under the key 'concatenated_input_ids'. - """ - concatenated_batch = {} - if is_encoder_decoder: - max_length = max( - batch["chosen_labels"].shape[1], batch["rejected_labels"].shape[1] - ) - else: - max_length = max( - batch["chosen_input_ids"].shape[1], batch["rejected_input_ids"].shape[1] - ) - - for k in batch: - if k.startswith("chosen") and isinstance(batch[k], mindspore.Tensor): - if "labels" in k or is_encoder_decoder: - pad_value = label_pad_token_id - elif k.endswith("_input_ids"): - pad_value = padding_value - elif k.endswith("_attention_mask"): - pad_value = 0 - concatenated_key = k.replace("chosen", "concatenated") - concatenated_batch[concatenated_key] = pad_to_length( - batch[k], max_length, pad_value=pad_value - ) - for k in batch: - if k.startswith("rejected") and isinstance(batch[k], mindspore.Tensor): - if "labels" in k or is_encoder_decoder: - pad_value = label_pad_token_id - elif k.endswith("_input_ids"): - pad_value = padding_value - elif k.endswith("_attention_mask"): - pad_value = 0 - concatenated_key = k.replace("rejected", "concatenated") - concatenated_batch[concatenated_key] = Tensor.from_numpy(numpy.concatenate( - ( - concatenated_batch[concatenated_key].numpy(), - pad_to_length(batch[k], max_length, pad_value=pad_value).numpy(), - ), - axis=0, - )) - - if is_encoder_decoder: - concatenated_batch["concatenated_input_ids"] = batch["prompt_input_ids"].repeat(2, 1) - concatenated_batch["concatenated_attention_mask"] = ( - batch["prompt_attention_mask"].repeat(2, 1) - ) - concatenated_batch["concatenated_decoder_input_ids"] = mindspore.ops.cat( - [batch["chosen_decoder_input_ids"], batch["rejected_decoder_input_ids"]], axis=0 - ) - - if is_vision_model: - concatenated_batch["pixel_values"] = mindspore.ops.cat( - [batch["prompt_pixel_values"], batch["prompt_pixel_values"]], axis=0 - ) - if "prompt_pixel_attention_mask" in batch: - concatenated_batch["pixel_attention_mask"] = mindspore.ops.cat( - [batch["prompt_pixel_attention_mask"], - batch["prompt_pixel_attention_mask"]], - axis=0 - ) - return concatenated_batch - - def dpo_loss( - self, - policy_chosen_logps, - policy_rejected_logps, - reference_chosen_logps, - reference_rejected_logps, - ) -> Tuple[mindspore.Tensor, mindspore.Tensor, mindspore.Tensor]: - """Compute the DPO loss for a batch of policy and reference model log probabilities. - - Args: - policy_chosen_logps: Log probabilities of the policy model for the - chosen responses. Shape: (batch_size,) - policy_rejected_logps: Log probabilities of the policy model for the - rejected responses. Shape: (batch_size,) - reference_chosen_logps: Log probabilities of the reference model for the - chosen responses. Shape: (batch_size,) - reference_rejected_logps: Log probabilities of the reference model for the - rejected responses. Shape: (batch_size,) - - Returns: - A tuple of three tensors: (losses, chosen_rewards, rejected_rewards). - The losses tensor contains the DPO loss for each example in the batch. 
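The `concatenated_inputs` helper deleted above exists so that chosen and rejected responses share a single forward pass: each tensor is padded to the longer of the two sequence lengths (labels with `label_pad_token_id`, attention masks with `0`, input ids with `padding_value`) and then stacked along the batch axis. A small self-contained sketch of that padding-and-stacking step for one tensor pair, using NumPy purely for illustration:

import numpy as np

def pad_and_concat(chosen: np.ndarray, rejected: np.ndarray, pad_value: int) -> np.ndarray:
    """Pad both (batch, seq_len) halves to a common length, then stack along batch."""
    max_len = max(chosen.shape[1], rejected.shape[1])

    def pad(x: np.ndarray) -> np.ndarray:
        out = np.full((x.shape[0], max_len), pad_value, dtype=x.dtype)
        out[:, : x.shape[1]] = x
        return out

    return np.concatenate([pad(chosen), pad(rejected)], axis=0)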
- The chosen_rewards and rejected_rewards tensors contain the rewards for - the chosen and rejected responses, respectively. - """ - chosen_logratios = policy_chosen_logps - ( - not self.reference_free - ) * reference_chosen_logps - rejected_logratios = policy_rejected_logps - ( - not self.reference_free - ) * reference_rejected_logps - if self.f_divergence_type == FDivergenceType.ALPHA_DIVERGENCE.value: - # The alpha-divergence formula: (1 - u^-alpha) / alpha - # The divergence difference between the chosen and rejected sample is: - # (1 - u[w]^-alpha) / alpha - (1 - u[l]^-alpha) / alpha - # = (u[l]^-alpha - u[w]^-alpha) / alpha - # where u[w] and u[l] are the policy/reference probability ratios - # for the chosen and rejected samples, respectively. - alpha_coef = FDivergenceConstants.ALPHA_DIVERGENCE_COEF_DEFAULT - if self.f_divergence_params and FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY \ - in self.f_divergence_params: - alpha_coef = float( - self.f_divergence_params[FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY]) - logits = ( - cap_exp(rejected_logratios * -alpha_coef) - cap_exp(chosen_logratios * -alpha_coef) - ) / alpha_coef - else: - pi_logratios = policy_chosen_logps - policy_rejected_logps - if self.reference_free: - ref_logratios = mindspore.tensor([0], dtype=pi_logratios.dtype) - else: - ref_logratios = reference_chosen_logps - reference_rejected_logps - - # pi_logratios = pi_logratios.to(self.accelerator.device) - # ref_logratios = ref_logratios.to(self.accelerator.device) - logits = pi_logratios - ref_logratios - - if self.f_divergence_type == FDivergenceType.JS_DIVERGENCE.value: - # The js-divergence formula: log(2 * u / (1 + u)) - # The divergence difference between the chosen and rejected sample is: - # log(2 * u[w] / (1 + u[w])) - log(2 * u[l] / (1 + u[l])) - # = log(u[w]) - log(u[l]) - (log(1 + u[w]) - log(1 + u[l])) - # where u[w] and u[l] are the policy/reference probability ratios - # for the chosen and rejected samples, respectively. - logits -= ops.softplus(chosen_logratios) - ops.softplus(rejected_logratios) - - # The beta is a temperature parameter for the DPO loss, typically something - # in the range of 0.1 to 0.5.We ignore the reference model as beta -> 0. - # The label_smoothing parameter encodes our uncertainty about the labels and - # calculates a conservative DPO loss. - - if self.loss_type == "sigmoid": - losses = ( - - ops.logsigmoid(self.beta * logits) * (1 - self.label_smoothing) - - ops.logsigmoid(-self.beta * logits) * self.label_smoothing - ) - elif self.loss_type == "robust": - losses = ( - -ops.logsigmoid(self.beta * logits) * (1 - self.label_smoothing) - + ops.logsigmoid(-self.beta * logits) * self.label_smoothing - ) / (1 - 2 * self.label_smoothing) - elif self.loss_type == "exo_pair": - # eqn (16) of the EXO paper: https://huggingface.co/papers/2402.00856 - import math - - if self.label_smoothing == 0: - self.label_smoothing = 1e-3 - losses = (self.beta * logits).sigmoid() * ( - ops.logsigmoid(self.beta * logits) - math.log(1 - self.label_smoothing) - ) + (-self.beta * logits).sigmoid() * ( - ops.logsigmoid(-self.beta * logits) - math.log(self.label_smoothing)) - elif self.loss_type == "hinge": - losses = mindspore.ops.relu(1 - self.beta * logits) - elif self.loss_type == "ipo": - # eqn (17) of the paper where beta is the regularization parameter for - # the IPO loss, denoted by tau in the paper. 
- losses = (logits - 1 / (2 * self.beta)) ** 2 - elif self.loss_type == "bco_pair": - chosen_logratios = policy_chosen_logps - reference_chosen_logps - rejected_logratios = policy_rejected_logps - reference_rejected_logps - - chosen_rewards = self.beta * chosen_logratios - rejected_rewards = self.beta * rejected_logratios - rewards = mindspore.ops.cat((chosen_rewards, rejected_rewards), 0).mean() - self.running.update(rewards) - delta = self.running.mean - - losses = -ops.logsigmoid((self.beta * chosen_logratios) - delta) - ops.logsigmoid( - -(self.beta * rejected_logratios - delta) - ) - elif self.loss_type == "sppo_hard": - # In the paper (https://huggingface.co/papers/2405.00675), - # SPPO employs a soft probability approach, estimated using the PairRM score. - # The probability calculation is conducted outside of the trainer class. The - # version described here is the hard probability version, where P in Equation - # (4.7) of Algorithm 1 is set to 1 for the winner and 0 for the loser. - a = policy_chosen_logps - reference_chosen_logps - b = policy_rejected_logps - reference_rejected_logps - - losses = (a - 0.5 / self.beta) ** 2 + (b + 0.5 / self.beta) ** 2 - elif self.loss_type == "nca_pair": - chosen_rewards = (policy_chosen_logps - reference_chosen_logps) * self.beta - rejected_rewards = (policy_rejected_logps - reference_rejected_logps) * self.beta - losses = ( - -ops.logsigmoid(chosen_rewards) - - 0.5 * ops.logsigmoid(-chosen_rewards) - - 0.5 * ops.logsigmoid(-rejected_rewards) - ) - elif self.loss_type == "aot_pair": - chosen_logratios = policy_chosen_logps - reference_chosen_logps - rejected_logratios = policy_rejected_logps - reference_rejected_logps - - chosen_logratios_sorted, _ = mindspore.sort(chosen_logratios, dim=0) - rejected_logratios_sorted, _ = mindspore.sort(rejected_logratios, dim=0) - - delta = chosen_logratios_sorted - rejected_logratios_sorted - - losses = ( - -ops.logsigmoid(self.beta * delta) * (1 - self.label_smoothing) - - ops.logsigmoid(-self.beta * delta) * self.label_smoothing - ) - - elif self.loss_type == "aot": - pi_logratios = policy_chosen_logps - policy_rejected_logps - ref_logratios = reference_chosen_logps - reference_rejected_logps - - pi_logratios_sorted, _ = mindspore.sort(pi_logratios, dim=0) - ref_logratios_sorted, _ = mindspore.sort(ref_logratios, dim=0) - - delta = pi_logratios_sorted - ref_logratios_sorted - - losses = ( - -ops.logsigmoid(self.beta * delta) * (1 - self.label_smoothing) - - ops.logsigmoid(-self.beta * delta) * self.label_smoothing - ) - - elif self.loss_type == "apo_zero": - # Eqn (7) of the APO paper (https://huggingface.co/papers/2408.06266) - # Use this loss when you believe the chosen outputs are better than - # your model's default output - # Increase chosen likelihood - losses_chosen = 1 - ops.sigmoid(self.beta * chosen_logratios) - # Decrease rejected likelihood - losses_rejected = ops.sigmoid(self.beta * rejected_logratios) - - losses = losses_chosen + losses_rejected - - elif self.loss_type == "apo_down": - # Eqn (8) of the APO paper (https://huggingface.co/papers/2408.06266) - # Use this loss when you believe the chosen outputs are worse than - # your model's default output - - losses_chosen = ops.sigmoid(self.beta * chosen_logratios) - # Decrease chosen likelihood - losses_rejected = 1 - ops.sigmoid( - self.beta * (chosen_logratios - rejected_logratios) - ) # Decrease rejected likelihood more - - losses = losses_chosen + losses_rejected - - else: - raise ValueError( - f"Unknown loss type: {self.loss_type}." 
- "Should be one of ['sigmoid', 'hinge', 'ipo', 'exo_pair', 'nca_pair'," - "'robust', 'bco_pair', 'sppo_hard', 'aot', 'aot_pair', 'apo_zero', 'apo_down']" - ) - chosen_rewards = ( - self.beta - * ( - policy_chosen_logps - reference_chosen_logps - ) - ) - rejected_rewards = ( - self.beta - * ( - policy_rejected_logps - - reference_rejected_logps - ) - ) - return losses, chosen_rewards, rejected_rewards - - @staticmethod - def get_batch_logps( - logits, - labels, - label_pad_token_id: int = -100, - is_encoder_decoder: bool = False, - ) -> Tuple[mindspore.Tensor, mindspore.Tensor]: - """Compute the log probabilities of the given labels under the given logits. - - Args: - logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, - vocab_size) - labels: Labels for which to compute the log probabilities. - Label tokens with a value of label_pad_token_id are ignored. - Shape: (batch_size, sequence_length) - label_pad_token_id: The label pad token id. - is_encoder_decoder: Whether the model is an encoder-decoder model. - - Returns: - A Tuple of two tensor of shape ((batch_size,), (batch_size,)) - containing the sum of log probabilities of the given labels under - the given logits in the first tensor and the number of non-masked - tokens in the second tensor. - """ - if logits.shape[:-1] != labels.shape: - raise ValueError( - f"Logits (batch and sequence length dim) {logits.shape[:-1]} and " - "labels must have the same shape {labels.shape}." - ) - - if not is_encoder_decoder: - labels = labels[:, 1:].copy() - logits = logits[:, :-1, :] - loss_mask = labels != label_pad_token_id - - # dummy token; we'll ignore the losses on these tokens later - labels[labels == label_pad_token_id] = 0 - per_token_logps = mindspore.ops.gather_elements( - logits, dim=2, index=labels.unsqueeze(2)).squeeze(2) - - return (per_token_logps * loss_mask).sum(-1), loss_mask.sum(-1) - - def concatenated_forward( - self, - model: nn.Cell, - batch: Dict[str, Union[List, mindspore.Tensor]] - ) -> Tuple[ - mindspore.Tensor, mindspore.Tensor, mindspore.Tensor, - mindspore.Tensor, mindspore.Tensor]: - """Run the given model on the given batch of inputs, concatenating - the chosen and rejected inputs together. - We do this to avoid doing two forward passes, because it's faster for FSDP. 
- """ - batch = batch['features'] - concatenated_batch = self.concatenated_inputs( - batch, - is_encoder_decoder=self.is_encoder_decoder, - is_vision_model=self.is_vision_model, - label_pad_token_id=self.label_pad_token_id, - padding_value=self.padding_value, - # device=self.accelerator.device, - ) - len_chosen = batch["chosen_labels"].shape[0] - model_kwargs = {} - - if self.is_encoder_decoder: - model_kwargs["labels"] = concatenated_batch["concatenated_labels"] - model_kwargs["decoder_input_ids"] = concatenated_batch.get( - "concatenated_decoder_input_ids") - - if self.is_vision_model: - model_kwargs["pixel_values"] = concatenated_batch["pixel_values"] - if "pixel_attention_mask" in concatenated_batch: - model_kwargs["pixel_attention_mask"] = concatenated_batch["pixel_attention_mask"] - - if self.aux_loss_enabled: - model_kwargs["output_router_logits"] = True - outputs = model( - concatenated_batch["concatenated_input_ids"], - attention_mask=concatenated_batch["concatenated_attention_mask"], - use_cache=False, - **model_kwargs, - ) - all_logits = outputs.logits - if all_logits.shape[:2] != concatenated_batch["concatenated_labels"].shape[:2]: - # for llava, the model returns logits for the entire sequence, - # including the image tokens (placed before the text tokens) - seq_len = concatenated_batch["concatenated_labels"].shape[1] - all_logits = all_logits[:, -seq_len:] - - all_logps, size_completion = self.get_batch_logps( - all_logits, - concatenated_batch["concatenated_labels"], - # average_log_prob=self.loss_type == "ipo", - is_encoder_decoder=self.is_encoder_decoder, - label_pad_token_id=self.label_pad_token_id, - ) - - labels = concatenated_batch["concatenated_labels"].copy() - # replace the old cross_entropy_func. - computed_logits = all_logits[:len_chosen] - computed_labels = labels[:len_chosen] - if not self.is_encoder_decoder: - # Shift so that tokens < n predict n - computed_logits = computed_logits[..., :-1, :].contiguous() - computed_labels = computed_labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = nn.CrossEntropyLoss(ignore_index=self.label_pad_token_id) - computed_logits = computed_logits.view(-1, computed_logits.shape[-1]) - computed_labels = computed_labels.view(-1) - # Enable model parallelism - computed_labels = computed_labels.to(mindspore.int32) - nll_loss = loss_fct(computed_logits, computed_labels) - if self.loss_type == "ipo": - all_logps = all_logps / size_completion - chosen_logps = all_logps[:len_chosen] - rejected_logps = all_logps[len_chosen:] - - chosen_logits = all_logits[:len_chosen] - rejected_logits = all_logits[len_chosen:] - - if self.aux_loss_enabled: - return (chosen_logps, rejected_logps, chosen_logits,\ - rejected_logits, nll_loss, outputs.aux_loss) - - return (chosen_logps, rejected_logps, chosen_logits,\ - rejected_logits, nll_loss) - - def get_batch_loss_metrics( - self, - model, - batch: Dict[str, Union[List, mindspore.Tensor]], - train_eval: Literal["train", "eval"] = "train", - ): - """Compute the DPO loss and other metrics for the given batch - of inputs for train or test.""" - metrics = {} - forward_output = self.concatenated_forward(model, batch) - ( - policy_chosen_logps, - policy_rejected_logps, - policy_chosen_logits, - policy_rejected_logits, - policy_nll_loss, - ) = forward_output[:5] - - if self.aux_loss_enabled: - aux_loss = forward_output[5] - # if reference_chosen_logps and reference_rejected_logps in - # batch use them, otherwise use the reference model - if ( - "reference_chosen_logps" in batch - and 
"reference_rejected_logps" in batch - and (self.precompute_ref_log_probs or self.args.rpo_alpha is not None) - ): - reference_chosen_logps = batch["reference_chosen_logps"] - reference_rejected_logps = batch["reference_rejected_logps"] - else: - with mindspore._no_grad(): - if self.ref_model is None: - with self.null_ref_context(): - reference_chosen_logps, reference_rejected_logps = \ - self.concatenated_forward( - self.model, batch - )[:2] - else: - reference_chosen_logps, reference_rejected_logps = \ - self.concatenated_forward( - self.ref_model, batch - )[:2] - losses, chosen_rewards, rejected_rewards = self.dpo_loss( - policy_chosen_logps, - policy_rejected_logps, - reference_chosen_logps, - reference_rejected_logps, - ) - - reward_accuracies = (chosen_rewards > rejected_rewards).float() - - if self.args.rpo_alpha is not None: - # RPO loss from V3 of the paper: - losses = losses + policy_nll_loss * self.args.rpo_alpha - - prefix = "eval_" if train_eval == "eval" else "" - metrics[f"{prefix}rewards/chosen"] = chosen_rewards.mean() - metrics[f"{prefix}rewards/rejected"] = rejected_rewards.mean() - metrics[f"{prefix}rewards/accuracies"] = reward_accuracies.mean() - metrics[f"{prefix}rewards/margins"] = (chosen_rewards - rejected_rewards).mean() - metrics[f"{prefix}logps/rejected"] = policy_rejected_logps.mean() - metrics[f"{prefix}logps/chosen"] = policy_chosen_logps.mean() - metrics[f"{prefix}logits/rejected"] = policy_rejected_logits.mean() - metrics[f"{prefix}logits/chosen"] = policy_chosen_logits.mean() - if self.args.rpo_alpha is not None: - metrics[f"{prefix}nll_loss"] = policy_nll_loss.mean() - - if self.aux_loss_enabled: - return losses.mean() + getattr( - model.config, "router_aux_loss_coef", 0.0) * aux_loss, metrics - return losses.mean(), metrics - - def compute_loss( - self, - model: Union[PreTrainedModel, nn.Cell], - inputs: Dict[str, Union[mindspore.Tensor, Any]], - return_outputs=False, - ) -> Union[mindspore.Tensor, Tuple[mindspore.Tensor, Dict[str, mindspore.Tensor]]]: - if not self.use_dpo_data_collator: - warnings.warn( - "compute_loss is only implemented for DPODataCollatorWithPadding," - "and you passed a datacollator that is different than " - "DPODataCollatorWithPadding - you might see unexpected behavior." - "Alternatively, you can implement your own prediction_step method " - "if you are using a custom data collator" - ) - - compute_loss_context_manager = amp.autocast("cuda") \ - if self._peft_has_been_casted_to_bf16 else nullcontext() - with compute_loss_context_manager: - loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="train") - - # Make sure to move the loss to the device the original - # accumulating loss is at back in the `Trainer` class: - # loss = loss.to(self.args.device) - # force log the metrics - self.store_metrics(metrics, train_eval="train") - - if return_outputs: - return (loss, metrics) - return loss - - def get_batch_samples(self, model, batch: Dict[str, mindspore.Tensor]) -> Tuple[str, str]: - """Generate samples from the model and reference model for the given batch of inputs.""" - - # If one uses `generate_during_eval` with peft + bf16, we need - # to explicitly call generate with the torch cuda amp context - # manager as some hidden states are silently casted to full precision. 
- generate_context_manager = amp.autocast("cuda") if self._peft_has_been_casted_to_bf16 else nullcontext() - - with generate_context_manager: - policy_output = model.generate( - input_ids=batch["prompt_input_ids"], - attention_mask=batch["prompt_attention_mask"], - max_length=self.max_length, - do_sample=True, - pad_token_id=self.tokenizer.pad_token_id, - ) - - # if reference_output in batch use that otherwise use the reference model - if "reference_output" in batch: - reference_output = batch["reference_output"] - else: - if self.ref_model is None: - with self.null_ref_context(): - reference_output = self.model.generate( - input_ids=batch["prompt_input_ids"], - attention_mask=batch["prompt_attention_mask"], - max_length=self.max_length, - do_sample=True, - pad_token_id=self.tokenizer.pad_token_id, - ) - else: - reference_output = self.ref_model.generate( - input_ids=batch["prompt_input_ids"], - attention_mask=batch["prompt_attention_mask"], - max_length=self.max_length, - do_sample=True, - pad_token_id=self.tokenizer.pad_token_id, - ) - - policy_output = pad_to_length( - policy_output, self.max_length, self.tokenizer.pad_token_id) - policy_output_decoded = self.tokenizer.batch_decode( - policy_output, skip_special_tokens=True) - - reference_output = pad_to_length( - reference_output, self.max_length, self.tokenizer.pad_token_id) - reference_output_decoded = self.tokenizer.batch_decode( - reference_output, skip_special_tokens=True) - - return policy_output_decoded, reference_output_decoded - - def prediction_step( - self, - model: Union[PreTrainedModel, nn.Cell], - inputs: Dict[str, Union[mindspore.Tensor, Any]], - prediction_loss_only: bool, - ignore_keys: Optional[List[str]] = None, - ): - if not self.use_dpo_data_collator: - warnings.warn( - "prediction_step is only implemented for DPODataCollatorWithPadding," - "and you passed a datacollator that is different than DPODataCollatorWithPadding -" - "you might see unexpected behavior. 
Alternatively, you can implement your own" - "prediction_step method if you are using a custom data collator" - ) - if ignore_keys is None: - if hasattr(model, "config"): - ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", []) - else: - ignore_keys = [] - - prediction_context_manager = amp.autocast("cuda") \ - if self._peft_has_been_casted_to_bf16 else nullcontext() - - with mindspore._no_grad(), prediction_context_manager: - loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="eval") - - # force log the metrics - self.store_metrics(metrics, train_eval="eval") - - if prediction_loss_only: - return (loss, None, None) - - # logits for the chosen and rejected samples from model - logits_dict = { - "eval_logits/chosen": metrics["eval_logits/chosen"], - "eval_logits/rejected": metrics["eval_logits/rejected"], - } - logits = tuple(v.unsqueeze(dim=0) for k, v in logits_dict.items() if k not in ignore_keys) - logits = mindspore.ops.stack(logits).mean(axis=1).to(self.accelerator.device) - labels = mindspore.ops.zeros(logits.shape[0]) - - return (loss, logits, labels) - - def store_metrics( - self, metrics: Dict[str, float], train_eval: Literal["train", "eval"] = "train" - ) -> None: - for key, value in metrics.items(): - self._stored_metrics[train_eval][key].append(value) - - def evaluation_loop( - self, - dataloader: GeneratorDataset, - description: str, - prediction_loss_only: Optional[bool] = None, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - ) -> EvalLoopOutput: - """ - Overriding built-in evaluation loop to store metrics for each batch. - Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`. - - Works both with or without labels. - """ - - # Sample and save to game log if requested (for one batch to save time) - if self.generate_during_eval: - # Generate random indices within the range of the total number of samples - num_samples = len(dataloader.dataset) - random_indices = random.sample(range(num_samples), k=self.args.eval_batch_size) - - # Use dataloader.dataset.select to get the random batch without - # iterating over the GeneratorDataset - random_batch_dataset = dataloader.dataset.select(random_indices) - random_batch = self.data_collator(random_batch_dataset) - random_batch = self._prepare_inputs(random_batch) - - # policy_output_decoded, ref_output_decoded = self.get_batch_samples( - # self.model, random_batch) - - # self.log( - # { - # "game_log": wandb.Table( - # columns=["Prompt", "Policy", "Ref Model"], - # rows=[ - # [prompt, pol[len(prompt) :], ref[len(prompt) :]] - # for prompt, pol, ref in zip( - # random_batch["prompt"], policy_output_decoded, ref_output_decoded - # ) - # ], - # ) - # } - # ) - self.state.log_history.pop() - - # Base evaluation - initial_output = super().evaluation_loop( - dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix - ) - - return initial_output - - def log(self, logs: Dict[str, float]) -> None: - """ - Log `logs` on the various objects watching training, including stored metrics. - - Args: - logs (`Dict[str, float]`): - The values to log. 
- """ - # logs either has 'loss' or 'eval_loss' - train_eval = "train" if "loss" in logs else "eval" - # Add averaged stored metrics to logs - for key, metrics in self._stored_metrics[train_eval].items(): - logs[key] = mindspore.tensor(metrics).mean().item() - del self._stored_metrics[train_eval] - return super().log(logs) diff --git a/mindnlp/trl/trainer/model_config.py b/mindnlp/trl/trainer/model_config.py deleted file mode 100644 index 10e69e95e..000000000 --- a/mindnlp/trl/trainer/model_config.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Trainer Model Config.""" -from dataclasses import dataclass -from typing import List, Literal, Optional - - -@dataclass -class ModelConfig: - """ - Configuration class for the models. - - Using [`~transformers.HfArgumentParser`] we can turn this class into - [argparse] arguments that can be specified on the command line. - - Parameters: - model_name_or_path (`Optional[str]`, *optional*, defaults to `None`): - Model checkpoint for weights initialization. - model_revision (`str`, *optional*, defaults to `"main"`): - Specific model version to use. It can be a branch name, a tag name, or a commit id. - torch_dtype (`Optional[Literal["auto", "bfloat16", - "float16", "float32"]]`, *optional*, defaults to `None`): - Override the default `torch.dtype` and load the model under this dtype. - Possible values are - - - `"bfloat16"`: `torch.bfloat16` - - `"float16"`: `torch.float16` - - `"float32"`: `torch.float32` - - `"auto"`: Automatically derive the dtype from the model's weights. - - trust_remote_code (`bool`, *optional*, defaults to `False`): - Whether to allow for custom models defined on the Hub in their own modeling files. - This option should only - be set to `True` for repositories you trust and in which you have read the code, - as it will execute code - present on the Hub on your local machine. - attn_implementation (`Optional[str]`, *optional*, defaults to `None`): - Which attention implementation to use. You can run - `--attn_implementation=flash_attention_2`, - in which case you must install this manually by running - `pip install flash-attn --no-build-isolation`. - use_peft (`bool`, *optional*, defaults to `False`): - Whether to use PEFT for training. - lora_r (`int`, *optional*, defaults to `16`): - LoRA R value. - lora_alpha (`int`, *optional*, defaults to `32`): - LoRA alpha. - lora_dropout (`float`, *optional*, defaults to `0.05`): - LoRA dropout. - lora_target_modules (`Optional[Union[str, List[str]]]`, *optional*, defaults to `None`): - LoRA target modules. - lora_modules_to_save (`Optional[List[str]]`, *optional*, defaults to `None`): - Model layers to unfreeze & train. - lora_task_type (`str`, *optional*, defaults to `"CAUSAL_LM"`): - Task type to pass for LoRA (use `"SEQ_CLS"` for reward modeling). 
- use_rslora (`bool`, *optional*, defaults to `False`): - Whether to use Rank-Stabilized LoRA, which sets the adapter - scaling factor to `lora_alpha/√r`, instead of - the original default value of `lora_alpha/r`. - load_in_8bit (`bool`, *optional*, defaults to `False`): - Whether to use 8 bit precision for the base model. Works only with LoRA. - load_in_4bit (`bool`, *optional*, defaults to `False`): - Whether to use 4 bit precision for the base model. Works only with LoRA. - bnb_4bit_quant_type (`str`, *optional*, defaults to `"nf4"`): - Quantization type (`"fp4"` or `"nf4"`). - use_bnb_nested_quant (`bool`, *optional*, defaults to `False`): - Whether to use nested quantization. - """ - - model_name_or_path: Optional[str] = None - model_revision: str = "main" - torch_dtype: Optional[Literal["auto", "bfloat16", "float16", "float32"]] = None - trust_remote_code: bool = False - attn_implementation: Optional[str] = None - use_peft: bool = False - lora_r: int = 16 - lora_alpha: int = 32 - lora_dropout: float = 0.05 - lora_target_modules: Optional[List[str]] = None - lora_modules_to_save: Optional[List[str]] = None - lora_task_type: str = "CAUSAL_LM" - use_rslora: bool = False - load_in_8bit: bool = False - load_in_4bit: bool = False - bnb_4bit_quant_type: Literal["fp4", "nf4"] = "nf4" - use_bnb_nested_quant: bool = False - - def __post_init__(self): - if self.load_in_8bit and self.load_in_4bit: - raise ValueError("You can't use 8 bit and 4 bit precision at the same time") - - if isinstance(self.lora_target_modules, list) and len(self.lora_target_modules) == 1: - self.lora_target_modules = self.lora_target_modules[0] diff --git a/mindnlp/trl/trainer/utils.py b/mindnlp/trl/trainer/utils.py deleted file mode 100644 index d3cb5eee8..000000000 --- a/mindnlp/trl/trainer/utils.py +++ /dev/null @@ -1,352 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""trl trainer utils, only contents dpo related class and func.""" -import dataclasses -import json -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple, Union - -import numpy as np -import mindspore -from mindnlp.core import no_grad -from ..core import pad_sequence - - -def pad( - tensors: List[mindspore.Tensor], - padding_value: int = 0, - padding_side: str = "right" -) -> mindspore.Tensor: - """ - Pad input function. 
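The `pad` helper whose definition begins here stacks a list of variable-length tensors into one batch tensor, padding either on the left or on the right. A short usage sketch under that assumption:

```python
import mindspore

seqs = [mindspore.tensor([1, 2, 3]), mindspore.tensor([4, 5])]
# right padding (the default): shorter sequences are padded at the end
batch_right = pad(seqs, padding_value=0, padding_side="right")   # [[1, 2, 3], [4, 5, 0]]
# left padding: typically used for decoder-only prompt tensors
batch_left = pad(seqs, padding_value=0, padding_side="left")     # [[1, 2, 3], [0, 4, 5]]
```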
- """ - # Determine the maximum shape for each dimension - output_shape = np.max([t.shape for t in tensors], 0).tolist() - - # Create an output tensor filled with the padding value - output = mindspore.ops.full( - (len(tensors), *output_shape), - padding_value, - dtype=tensors[0].dtype - ) - for i, t in enumerate(tensors): - # Determine the slice for the sequence dimension - if padding_side == "left": - seq_slice = slice(output_shape[0] - t.shape[0], output_shape[0]) - elif padding_side == "right": - seq_slice = slice(0, t.shape[0]) - else: - raise ValueError("padding_side must be 'left' or 'right'") - - slices = (seq_slice,) + tuple(slice(0, s) for s in t.shape[1:]) - output[i][slices] = t - - return output - - -@dataclass -class DPODataCollatorWithPadding: - r""" - DPO DataCollator class that pads the tokenized inputs to the maximum length of the batch. - - Args: - pad_token_id (`int` defaults to 0): - The tokenizer's pad_token_id. - label_pad_token_id (`int`, defaults to -100): - The label used for masking. - is_encoder_decoder (`Optional[bool]`, `optional`, defaults to `None`): - Whether or not you model has an encoder_decoder architecture. - """ - - pad_token_id: int = 0 - label_pad_token_id: int = -100 - is_encoder_decoder: Optional[bool] = False - - def encoder_decoder_pad(self, features, k): - """When model is encoder-decoder arch, do pad.""" - to_pad = [mindspore.Tensor(ex[k]) for ex in features] - if (k.startswith("prompt")) and (k.endswith("input_ids")): - if self.pad_token_id is None: - raise ValueError( - "Padding is enabled, but the tokenizer is " - " not configured with a padding token." - " Explicitly set `tokenizer.pad_token` " - "(e.g. `tokenizer.pad_token = tokenizer.eos_token`)" - " before calling the trainer." - ) - padding_value = self.pad_token_id - elif k.endswith("_attention_mask"): - padding_value = 0 - elif k.startswith(("chosen", "rejected", "completion")) or ("decoder" in k): - padding_value = self.label_pad_token_id - else: - raise ValueError(f"Unexpected key in batch '{k}'") - return to_pad, padding_value - - def non_encoder_decoder_pad(self, features, k): - """when model is not encoder-decoder arch, do pad.""" - # Set padding value based on the key - if k.endswith("_input_ids"): - if self.pad_token_id is None: - raise ValueError( - "Padding is enabled, but the tokenizer is " - "not configured with a padding token." - " Explicitly set `tokenizer.pad_token` " - "(e.g. `tokenizer.pad_token = tokenizer.eos_token`)" - " before calling the trainer." 
- ) - padding_value = self.pad_token_id - elif k.endswith("_labels"): - padding_value = self.label_pad_token_id - elif k.endswith("_attention_mask"): - padding_value = 0 - elif k.endswith("_pixel_values"): - padding_value = 0 - else: - raise ValueError(f"Unexpected key in batch '{k}'") - - # Set padding side based on the key - if k in ["prompt_input_ids", "prompt_attention_mask"]: - padding_side = "left" - else: - padding_side = "right" - - # Set the dtype - if k.endswith("_pixel_values"): - dtype = mindspore.float32 # will be downcasted if necessary by the Trainer - else: - dtype = mindspore.int64 - # Convert to tensor and pad - to_pad = [mindspore.tensor(ex[k], dtype=dtype).squeeze(0) for ex in features] - return to_pad, padding_value, padding_side - - def __call__(self, features): - # first, pad everything to the same length - padded_batch = {} - - for k in features[0].keys(): - if k.endswith(("_input_ids", "_attention_mask", "_labels", "_pixel_values")): - if self.is_encoder_decoder: - to_pad, padding_value = self.encoder_decoder_pad(features, k) - padded_batch[k] = pad_sequence( - to_pad, - padding_value=padding_value - ) - else: - to_pad, padding_value, padding_side = self.non_encoder_decoder_pad( - features, k - ) - padded_batch[k] = pad( - to_pad, - padding_value=padding_value, - padding_side=padding_side - ) - elif k.endswith("_logps"): - # the cached reference model logprobs - padded_batch[k] = mindspore.tensor(features[k], dtype=mindspore.float32) - else: - padded_batch[k] = [ex[k] for ex in features] - return padded_batch - - -@dataclass -class RunningMoments: - """ - Calculates the running mean and standard deviation of a data stream. Reference: - https://github.com/OpenLMLab/MOSS-RLHF/blob/40b91eb2f2b71b16919addede0341d2bef70825d/utils.py#L75 - """ - mean: float = 0 - std: float = 1 - var: float = 1 - count: float = 1e-24 - - @no_grad() - def update(self, xs: mindspore.Tensor) -> Tuple[float, float]: - """ - Updates running moments from batch's moments computed across ranks - """ - # if self.accelerator.use_distributed: - # xs_mean, xs_var, xs_count = get_global_statistics(self.accelerator, xs) - # else: - xs_count = xs.numel() - xs_var, xs_mean = mindspore.ops.var_mean(xs) - xs_mean, xs_var = xs_mean.float(), xs_var.float() - - delta = xs_mean - self.mean - tot_count = self.count + xs_count - - new_sum = xs_var * xs_count - # correct old_sum deviation accounting for the new mean - old_sum = self.var * self.count + delta**2 * self.count * xs_count / tot_count - tot_sum = old_sum + new_sum - - self.mean += (delta * xs_count / tot_count).item() - new_var = tot_sum / tot_count - self.std = (new_var * tot_count / (tot_count - 1)).float().sqrt().item() - self.var = new_var.item() - self.count = tot_count - - return xs_mean.item(), (xs_var * xs_count / (xs_count - 1)).float().sqrt().item() - - def save_to_json(self, json_path: str): - """Save the content of this instance in JSON format inside `json_path`.""" - # save everything except accelerator - save_dict = dataclasses.asdict( - self, - dict_factory=lambda x: {k: v for (k, v) in x if k != "accelerator"} - ) - json_string = json.dumps(save_dict, indent=2, sort_keys=True) + "\n" - with open(json_path, "w", encoding="utf-8") as f: - f.write(json_string) - - @classmethod - def load_from_json(cls, accelerator, json_path: str): - """Create an instance from the content of `json_path`.""" - # load everything except accelerator - with open(json_path, encoding="utf-8") as f: - text = f.read() - return cls(accelerator=accelerator, 
-                   **json.loads(text))
-
-
-def add_bos_token_if_needed(
-    bos_token_id: Optional[int],
-    prompt_len_input_ids: int,
-    prompt_tokens: Dict[str, List[int]],
-    chosen_prompt_len_input_ids: int,
-    chosen_tokens: Dict[str, List[int]],
-    rejected_prompt_len_input_ids: int,
-    rejected_tokens: Dict[str, List[int]],
-):
-    """
-    Add BOS token if needed.
-    """
-    if bos_token_id is not None:
-        if prompt_len_input_ids == 0 or \
-            bos_token_id != prompt_tokens["prompt_input_ids"][0]:
-            prompt_tokens["prompt_input_ids"] = (
-                [bos_token_id] + prompt_tokens["prompt_input_ids"]
-            )
-            prompt_tokens["prompt_attention_mask"] = (
-                [1] + prompt_tokens["prompt_attention_mask"]
-            )
-        if chosen_prompt_len_input_ids == 0 or \
-            bos_token_id != chosen_tokens["prompt_input_ids"][0]:
-            chosen_tokens["prompt_input_ids"] = (
-                [bos_token_id] + chosen_tokens["prompt_input_ids"]
-            )
-            chosen_tokens["prompt_attention_mask"] = (
-                [1] + chosen_tokens["prompt_attention_mask"]
-            )
-        if rejected_prompt_len_input_ids == 0 or \
-            bos_token_id != rejected_tokens["prompt_input_ids"][0]:
-            rejected_tokens["prompt_input_ids"] = (
-                [bos_token_id] + rejected_tokens["prompt_input_ids"]
-            )
-            rejected_tokens["prompt_attention_mask"] = (
-                [1] + rejected_tokens["prompt_attention_mask"]
-            )
-    return prompt_tokens, chosen_tokens, rejected_tokens
-
-
-def add_eos_token_if_needed(
-    eos_token_id: int,
-    chosen_tokens: Dict[str, List[int]],
-    rejected_tokens: Dict[str, List[int]]
-):
-    """
-    Add EOS token if needed.
-    """
-    if len(chosen_tokens["input_ids"]) == 0 or eos_token_id != chosen_tokens["input_ids"][-1]:
-        chosen_tokens["input_ids"].append(eos_token_id)
-        chosen_tokens["attention_mask"].append(1)
-    if len(rejected_tokens["input_ids"]) == 0 or eos_token_id != rejected_tokens["input_ids"][-1]:
-        rejected_tokens["input_ids"].append(eos_token_id)
-        rejected_tokens["attention_mask"].append(1)
-    return chosen_tokens, rejected_tokens
-
-
-def get_exp_cap(value, decimal=4):
-    """
-    Get the exponent cap of a value. This is used to cap the exponent of a value to avoid overflow.
-    The formula is: log(value.dtype.max).
-    E.g. for the float32 data type, the maximum exponent value is 88.7228 to 4 decimal points.
-
-    Args:
-        value (`mindspore.Tensor`):
-            The input tensor used to obtain the data type.
-        decimal (`int`):
-            The number of decimal points of the output exponent cap.
-            Directly calling exp(log(float32 max)) results in inf,
-            so we cap the exponent to 88.7228 to avoid overflow.
-    """
-    vdtype_max = mindspore.ops.zeros([1]).to(value.dtype)\
-        + mindspore.tensor(np.finfo(mindspore.dtype_to_nptype(value.dtype)).max)
-    vdtype_log_max = mindspore.ops.log(vdtype_max)
-    if decimal > 0:
-        return mindspore.ops.floor(vdtype_log_max * 10**decimal) / 10**decimal
-    return vdtype_log_max
-
-
-def cap_exp(value, cap=-1):
-    """
-    Cap the exponent value below the upper bound to avoid overflow,
-    before calling exp.
-    """
-    cap = get_exp_cap(value) if cap < 0 else cap
-    return mindspore.ops.exp(mindspore.ops.clamp(value, max=cap))
-
-
-def disable_dropout_in_model(model: mindspore.nn.Cell) -> None:
-    """
-    Disable all dropout layers in the given model.
-    """
-    for module in model.modules():
-        if isinstance(module, mindspore.nn.Dropout):
-            module.p = 0
-
-def pad_to_length(
-    tensor: mindspore.Tensor,
-    length: int,
-    pad_value: Union[int, float],
-    dim: int = -1
-    ) -> mindspore.Tensor:
-    """
-    Pad the input tensor along `dim` to the given length.
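The exponent-capping helpers above exist so that `exp` stays finite near the top of the dtype's range; the 88.7228 constant is just log(float32 max) rounded to four decimals. A brief illustrative sketch of the idea:

```python
import mindspore
from mindspore import ops

x = mindspore.tensor([1.0, 50.0, 1000.0], dtype=mindspore.float32)
print(ops.exp(x))                          # the last entry overflows to inf
print(ops.exp(ops.clamp(x, max=88.7228)))  # clamped first, so every entry stays finite
```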
- """ - if tensor.shape[dim] >= length: - return tensor - pad_size = list(tensor.shape) - pad_size[dim] = length - tensor.shape[dim] - return mindspore.ops.cat( - [ - tensor, - pad_value * mindspore.ops.ones(tuple(pad_size), dtype=tensor.dtype), - ], - axis=dim, - ) - - -def peft_module_casting_to_bf16(model): - """ - Make peft model modules into the same precision. - """ - for name, module in model.named_modules(): - if isinstance(module, mindspore.nn.LayerNorm) or "norm" in name: - module = module.to(mindspore.float32) - elif any(x in name for x in ["lm_head", "embed_tokens", "wte", "wpe"]): - if hasattr(module, "weight"): - if module.weight.dtype == mindspore.float32: - module = module.to(mindspore.bfloat16) diff --git a/mindnlp/utils/__init__.py b/mindnlp/utils/__init__.py deleted file mode 100644 index d5e9be11b..000000000 --- a/mindnlp/utils/__init__.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -""" -Common utils -""" -from .generic import * -from .decompress import unzip, untar, ungz -from .download import * -from .compatibility import * -from .chat_template_utils import * -from .import_utils import requires_backends, is_mindspore_available, OptionalDependencyNotAvailable, is_sentencepiece_available, \ -is_tokenizers_available, direct_transformers_import, is_protobuf_available, is_safetensors_available, \ -is_cython_available, is_pretty_midi_available, is_essentia_available, is_librosa_available, is_scipy_available, is_pyctcdecode_available, \ -is_jieba_available, is_vision_available, is_sudachi_projection_available, is_g2p_en_available, is_levenshtein_available, is_nltk_available, \ -is_bs4_available, is_pytesseract_available, is_tiktoken_available, is_einops_available, is_faiss_available, is_datasets_available, \ -is_sacremoses_available, is_phonemizer_available,is_speech_available, is_kenlm_available, is_triton_available - -from .testing_utils import require_mindspore -from .save import convert_file_size_to_int -from .peft_utils import find_adapter_config_file - -DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] -DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]] -SENTENCEPIECE_UNDERLINE = "▁" diff --git a/mindnlp/utils/backbone_utils.py b/mindnlp/utils/backbone_utils.py deleted file mode 100644 index c4a6adbc6..000000000 --- a/mindnlp/utils/backbone_utils.py +++ /dev/null @@ -1,547 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -""" Collection of utils to be used by backbones and their components.""" - -import enum -import inspect -from typing import Iterable, List, Optional, Tuple, Union - - -class BackboneType(enum.Enum): - - r""" - Represents the types of backbone structures. - - This class inherits from enum.Enum and provides a set of backbone types that can be used in various applications. - """ - MINDCV = "mindcv" - MINDNLP = "mindnlp" - - -def verify_out_features_out_indices( - out_features: Optional[Iterable[str]], out_indices: Optional[Iterable[int]], stage_names: Optional[Iterable[str]] -): - """ - Verify that out_indices and out_features are valid for the given stage_names. - """ - if stage_names is None: - raise ValueError("Stage_names must be set for transformers backbones") - - if out_features is not None: - if not isinstance(out_features, (list,)): - raise ValueError(f"out_features must be a list got {type(out_features)}") - if any(feat not in stage_names for feat in out_features): - raise ValueError(f"out_features must be a subset of stage_names: {stage_names} got {out_features}") - if len(out_features) != len(set(out_features)): - raise ValueError(f"out_features must not contain any duplicates, got {out_features}") - sorted_feats = [feat for feat in stage_names if feat in out_features] - if out_features != sorted_feats: - raise ValueError( - f"out_features must be in the same order as stage_names, expected {sorted_feats} got {out_features}" - ) - - if out_indices is not None: - if not isinstance(out_indices, (list, tuple)): - raise ValueError(f"out_indices must be a list or tuple, got {type(out_indices)}") - # Convert negative indices to their positive equivalent: [-1,] -> [len(stage_names) - 1,] - positive_indices = tuple(idx % len(stage_names) if idx < 0 else idx for idx in out_indices) - if any(idx for idx in positive_indices if idx not in range(len(stage_names))): - raise ValueError(f"out_indices must be valid indices for stage_names {stage_names}, got {out_indices}") - if len(positive_indices) != len(set(positive_indices)): - msg = f"out_indices must not contain any duplicates, got {out_indices}" - msg += f"(equivalent to {positive_indices}))" if positive_indices != out_indices else "" - raise ValueError(msg) - if positive_indices != tuple(sorted(positive_indices)): - sorted_negative = tuple(idx for _, idx in sorted(zip(positive_indices, out_indices), key=lambda x: x[0])) - raise ValueError( - f"out_indices must be in the same order as stage_names, expected {sorted_negative} got {out_indices}" - ) - - if out_features is not None and out_indices is not None: - if len(out_features) != len(out_indices): - raise ValueError("out_features and out_indices should have the same length if both are set") - if out_features != [stage_names[idx] for idx in out_indices]: - raise ValueError("out_features and out_indices should correspond to the same stages if both are set") - - -def _align_output_features_output_indices( - out_features: Optional[List[str]], - out_indices: Optional[Union[List[int], Tuple[int]]], - stage_names: List[str], -): - """ - Finds the corresponding `out_features` and `out_indices` for the given `stage_names`. - - The logic is as follows: - - - `out_features` not set, `out_indices` set: `out_features` is set to the `out_features` corresponding to the - `out_indices`. 
- - `out_indices` not set, `out_features` set: `out_indices` is set to the `out_indices` corresponding to the - `out_features`. - - `out_indices` and `out_features` not set: `out_indices` and `out_features` are set to the last stage. - - `out_indices` and `out_features` set: input `out_indices` and `out_features` are returned. - - Args: - out_features (`List[str]`): The names of the features for the backbone to output. - out_indices (`List[int]` or `Tuple[int]`): The indices of the features for the backbone to output. - stage_names (`List[str]`): The names of the stages of the backbone. - """ - if out_indices is None and out_features is None: - out_indices = [len(stage_names) - 1] - out_features = [stage_names[-1]] - elif out_indices is None and out_features is not None: - out_indices = [stage_names.index(layer) for layer in out_features] - elif out_features is None and out_indices is not None: - out_features = [stage_names[idx] for idx in out_indices] - return out_features, out_indices - - -def get_aligned_output_features_output_indices( - out_features: Optional[List[str]], - out_indices: Optional[Union[List[int], Tuple[int]]], - stage_names: List[str], -) -> Tuple[List[str], List[int]]: - """ - Get the `out_features` and `out_indices` so that they are aligned. - - The logic is as follows: - - - `out_features` not set, `out_indices` set: `out_features` is set to the `out_features` corresponding to the - `out_indices`. - - `out_indices` not set, `out_features` set: `out_indices` is set to the `out_indices` corresponding to the - `out_features`. - - `out_indices` and `out_features` not set: `out_indices` and `out_features` are set to the last stage. - - `out_indices` and `out_features` set: they are verified to be aligned. - - Args: - out_features (`List[str]`): The names of the features for the backbone to output. - out_indices (`List[int]` or `Tuple[int]`): The indices of the features for the backbone to output. - stage_names (`List[str]`): The names of the stages of the backbone. - """ - # First verify that the out_features and out_indices are valid - verify_out_features_out_indices(out_features=out_features, out_indices=out_indices, stage_names=stage_names) - output_features, output_indices = _align_output_features_output_indices( - out_features=out_features, out_indices=out_indices, stage_names=stage_names - ) - # Verify that the aligned out_features and out_indices are valid - verify_out_features_out_indices(out_features=output_features, out_indices=output_indices, stage_names=stage_names) - return output_features, output_indices - - -class BackboneMixin: - r""" - The `BackboneMixin` class represents a mixin for initializing backbone models used in computer vision and - natural language processing tasks. It provides methods for initializing the backbone, setting - output features and indices, accessing feature channels, and serializing the instance to a Python dictionary. - - Attributes: - stage_names: A list of stage names in the backbone model. - num_features: A list of the number of channels for each stage in the backbone model. - out_features: A list of output features from the backbone model. - out_indices: A list of output indices from the backbone model. - out_feature_channels: A dictionary mapping stage names to the number of channels for each output feature. - channels: A list of the number of channels for each output feature. - - Methods: - _init_timm_backbone: Initialize the backbone model from the 'timm' library. 
- _init_transformers_backbone: Initialize the backbone model for transformers. - _init_backbone: Initialize the backbone based on the specified type (MINDCV or MINDNLP). - forward_with_filtered_kwargs: Forward method with filtered keyword arguments. - forward: Forward method for processing input data. - to_dict: Serialize the instance to a Python dictionary, including the 'out_features' and 'out_indices' attributes. - - Raises: - ValueError: If the backbone type is not supported. - - Note: - This class is intended to be used as a mixin and should be inherited by other classes. - """ - backbone_type: Optional[BackboneType] = None - - def _init_timm_backbone(self, config) -> None: - """ - Initialize the backbone model from timm The backbone must already be loaded to self._backbone - """ - if getattr(self, "_backbone", None) is None: - raise ValueError("self._backbone must be set before calling _init_timm_backbone") - - # These will diagree with the defaults for the transformers models e.g. for resnet50 - # the transformer model has out_features = ['stem', 'stage1', 'stage2', 'stage3', 'stage4'] - # the timm model has out_features = ['act', 'layer1', 'layer2', 'layer3', 'layer4'] - self.stage_names = [stage["module"] for stage in self._backbone.feature_info.info] - self.num_features = [stage["num_chs"] for stage in self._backbone.feature_info.info] - out_indices = self._backbone.feature_info.out_indices - out_features = self._backbone.feature_info.module_name() - - # We verify the out indices and out features are valid - verify_out_features_out_indices( - out_features=out_features, out_indices=out_indices, stage_names=self.stage_names - ) - self._out_features, self._out_indices = out_features, out_indices - - def _init_transformers_backbone(self, config) -> None: - r""" - This method initializes the transformers backbone. - - Args: - self (BackboneMixin): The instance of the BackboneMixin class. - config (object): - The configuration object containing the following attributes: - - - stage_names (list): A list of stage names for the transformers backbone. - - out_features (list, optional): A list of output features. Defaults to None. - - out_indices (list, optional): A list of output indices. Defaults to None. - - Returns: - None. - - Raises: - None. - """ - stage_names = getattr(config, "stage_names") - out_features = getattr(config, "out_features", None) - out_indices = getattr(config, "out_indices", None) - - self.stage_names = stage_names - self._out_features, self._out_indices = get_aligned_output_features_output_indices( - out_features=out_features, out_indices=out_indices, stage_names=stage_names - ) - # Number of channels for each stage. This is set in the transformer backbone model init - self.num_features = None - - def _init_backbone(self, config) -> None: - """ - Method to initialize the backbone. This method is called by the forwardor of the base class after the - pretrained model weights have been loaded. 
- """ - self.config = config - - self.use_timm_backbone = getattr(config, "use_timm_backbone", False) - self.backbone_type = BackboneType.MINDCV if self.use_timm_backbone else BackboneType.MINDNLP - - if self.backbone_type == BackboneType.MINDCV: - self._init_timm_backbone(config) - elif self.backbone_type == BackboneType.MINDNLP: - self._init_transformers_backbone(config) - else: - raise ValueError(f"backbone_type {self.backbone_type} not supported.") - - @property - def out_features(self): - r""" - This method returns the value of the attribute 'out_features' in the BackboneMixin class. - - Args: - self: An instance of the BackboneMixin class. - - Returns: - None: This method returns the value of the attribute 'out_features', which is of type None. - - Raises: - None - """ - return self._out_features - - @out_features.setter - def out_features(self, out_features: List[str]): - """ - Set the out_features attribute. This will also update the out_indices attribute to match the new out_features. - """ - self._out_features, self._out_indices = get_aligned_output_features_output_indices( - out_features=out_features, out_indices=None, stage_names=self.stage_names - ) - - @property - def out_indices(self): - r""" - Retrieve the output indices from the BackboneMixin. - - Args: - self (BackboneMixin): The instance of the BackboneMixin class. - It represents the current instance of the BackboneMixin. - - Returns: - None: This method returns the output indices stored in the '_out_indices' attribute of the - BackboneMixin instance. - - Raises: - None. - """ - return self._out_indices - - @out_indices.setter - def out_indices(self, out_indices: Union[Tuple[int], List[int]]): - """ - Set the out_indices attribute. This will also update the out_features attribute to match the new out_indices. - """ - self._out_features, self._out_indices = get_aligned_output_features_output_indices( - out_features=None, out_indices=out_indices, stage_names=self.stage_names - ) - - @property - def out_feature_channels(self): - r""" - Returns a dictionary containing the number of feature channels for each stage in the backbone. - - Args: - self (BackboneMixin): The instance of the class. - - Returns: - dict: A dictionary where the keys represent the stages in the backbone and the values represent the - number of feature channels for each stage. - - Raises: - None. - - Example: - ```python - >>> backbone = BackboneMixin() - >>> backbone.out_feature_channels() - {'stage1': 64, 'stage2': 128, 'stage3': 256, 'stage4': 512} - ``` - """ - # the current backbones will output the number of channels for each stage - # even if that stage is not in the out_features list. - return {stage: self.num_features[i] for i, stage in enumerate(self.stage_names)} - - @property - def channels(self): - r""" - This method retrieves the feature channels from the BackboneMixin instance. - - Args: - self (BackboneMixin): The instance of the BackboneMixin class. - - Returns: - list: A list of feature channels corresponding to the out_features. - - Raises: - None - """ - return [self.out_feature_channels[name] for name in self.out_features] - - def forward_with_filtered_kwargs(self, *args, **kwargs): - """ - Forward with Filtered Kwargs - - This method is defined in the 'BackboneMixin' class and is used to invoke the 'forward' method while - filtering the keyword arguments based on the parameters defined in the 'forward' method's signature. - - Args: - self: An instance of the 'BackboneMixin' class. - - Returns: - None. - - Raises: - None. 
- """ - signature = dict(inspect.signature(self.forward).parameters) - filtered_kwargs = {k: v for k, v in kwargs.items() if k in signature} - return self(*args, **filtered_kwargs) # pylint: disable=not-callable - - def forward( - self, - pixel_values, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - r""" - This method, named 'forward', is defined in the class 'BackboneMixin' and is responsible for performing a - forward pass through the network. - - Args: - self: The instance of the class. - pixel_values: A tensor containing the input pixel values. - output_hidden_states: (Optional) A boolean flag indicating whether to output the hidden states. - Defaults to None. - output_attentions: (Optional) A boolean flag indicating whether to output the attentions. - Defaults to None. - return_dict: (Optional) A boolean flag indicating whether to return a dictionary. Defaults to None. - - Returns: - None. - - Raises: - NotImplementedError: If the method is not implemented by the derived class. - """ - raise NotImplementedError("This method should be implemented by the derived class.") - - def to_dict(self): - """ - Serializes this instance to a Python dictionary. Override the default `to_dict()` from `PretrainedConfig` to - include the `out_features` and `out_indices` attributes. - """ - output = super().to_dict() - output["out_features"] = output.pop("_out_features") - output["out_indices"] = output.pop("_out_indices") - return output - - -class BackboneConfigMixin: - """ - A Mixin to support handling the `out_features` and `out_indices` attributes for the backbone configurations. - """ - @property - def out_features(self): - r""" - Method 'out_features' in the class 'BackboneConfigMixin'. - - Args: - self: object - The instance of the class. - The 'self' parameter refers to the instance of the class itself. - It is used to access and modify class attributes and methods. - - Returns: - `_out_features`: The method returns the value of the '_out_features' attribute of the class instance. - - Raises: - None. - """ - return self._out_features - - @out_features.setter - def out_features(self, out_features: List[str]): - """ - Set the out_features attribute. This will also update the out_indices attribute to match the new out_features. - """ - self._out_features, self._out_indices = get_aligned_output_features_output_indices( - out_features=out_features, out_indices=None, stage_names=self.stage_names - ) - - @property - def out_indices(self): - r""" - Method 'out_indices' in the class 'BackboneConfigMixin'. - - Args: - self: BackboneConfigMixin object. - The instance of the BackboneConfigMixin class. - - Returns: - `_out_indices`: This method returns the '_out_indices' attribute of the instance. - - Raises: - None. - """ - return self._out_indices - - @out_indices.setter - def out_indices(self, out_indices: Union[Tuple[int], List[int]]): - """ - Set the out_indices attribute. This will also update the out_features attribute to match the new out_indices. - """ - self._out_features, self._out_indices = get_aligned_output_features_output_indices( - out_features=None, out_indices=out_indices, stage_names=self.stage_names - ) - - def to_dict(self): - """ - Serializes this instance to a Python dictionary. Override the default `to_dict()` from `PretrainedConfig` to - include the `out_features` and `out_indices` attributes. 
- """ - output = super().to_dict() - output["out_features"] = output.pop("_out_features") - output["out_indices"] = output.pop("_out_indices") - return output - -backbone_map = { - 'resnet18': 'microsoft/resnet-18', - 'resnet50': 'microsoft/resnet-50' -} - -def load_backbone(config): - """ - Loads the backbone model from a config object. - - If the config is from the backbone model itself, then we return a backbone model with randomly initialized - weights. - - If the config is from the parent model of the backbone model itself, then we load the pretrained backbone weights - if specified. - """ - from mindnlp.transformers import AutoBackbone, AutoConfig - backbone_config = getattr(config, "backbone_config", None) - use_timm_backbone = getattr(config, "use_timm_backbone", None) - use_pretrained_backbone = getattr(config, "use_pretrained_backbone", None) - backbone_checkpoint = getattr(config, "backbone", None) - backbone_kwargs = getattr(config, "backbone_kwargs", None) - if use_timm_backbone and backbone_checkpoint is not None: - backbone_checkpoint = backbone_map[backbone_checkpoint] - use_timm_backbone = False - - backbone_kwargs = {} if backbone_kwargs is None else backbone_kwargs - - if backbone_kwargs and backbone_config is not None: - raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") - - # If there is a backbone_config and a backbone checkpoint, and use_pretrained_backbone=False then the desired - # behaviour is ill-defined: do you want to load from the checkpoint's config or the backbone_config? - if backbone_config is not None and backbone_checkpoint is not None and use_pretrained_backbone is not None: - raise ValueError("Cannot specify both config.backbone_config and config.backbone") - - # If any of thhe following are set, then the config passed in is from a model which contains a backbone. - if ( - backbone_config is None - and use_timm_backbone is None - and backbone_checkpoint is None - and backbone_checkpoint is None - ): - return AutoBackbone.from_config(config=config, **backbone_kwargs) - - # config from the parent model that has a backbone - if use_timm_backbone: - if backbone_checkpoint is None: - raise ValueError("config.backbone must be set if use_timm_backbone is True") - # Because of how timm backbones were originally added to models, we need to pass in use_pretrained_backbone - # to determine whether to load the pretrained weights. 
- backbone = AutoBackbone.from_pretrained( - backbone_checkpoint, - use_timm_backbone=use_timm_backbone, - use_pretrained_backbone=use_pretrained_backbone, - **backbone_kwargs, - ) - elif use_pretrained_backbone: - if backbone_checkpoint is None: - raise ValueError("config.backbone must be set if use_pretrained_backbone is True") - backbone = AutoBackbone.from_pretrained(backbone_checkpoint, **backbone_kwargs) - else: - if backbone_config is None and backbone_checkpoint is None: - raise ValueError("Either config.backbone_config or config.backbone must be set") - if backbone_config is None: - backbone_config = AutoConfig.from_pretrained(backbone_checkpoint, **backbone_kwargs) - backbone = AutoBackbone.from_config(config=backbone_config) - return backbone - -def verify_backbone_config_arguments( - use_timm_backbone: bool, - use_pretrained_backbone: bool, - backbone: Optional[str], - backbone_config: Optional[Union[dict, "PretrainedConfig"]], - backbone_kwargs: Optional[dict], -): - """ - Verify that the config arguments to be passed to load_backbone are valid - """ - if backbone_config is not None and backbone is not None: - raise ValueError("You can't specify both `backbone` and `backbone_config`.") - - if backbone_config is not None and use_timm_backbone: - raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") - - if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: - raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") diff --git a/mindnlp/utils/chat_template_utils.py b/mindnlp/utils/chat_template_utils.py deleted file mode 100644 index 7ad66593c..000000000 --- a/mindnlp/utils/chat_template_utils.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""chat template utils""" -import inspect -import json -import re -from typing import Any, Callable, Dict, Optional, Tuple, Union, get_args, get_origin, get_type_hints - - -BASIC_TYPES = (int, float, str, bool, Any, type(None), ...) -# Extracts the initial segment of the docstring, containing the function description -description_re = re.compile(r"^(.*?)[\n\s]*(Args:|Returns:|Raises:|\Z)", re.DOTALL) -# Extracts the Args: block from the docstring -args_re = re.compile(r"\n\s*Args:\n\s*(.*?)[\n\s]*(Returns:|Raises:|\Z)", re.DOTALL) -# Splits the Args: block into individual arguments -args_split_re = re.compile( - r""" -(?:^|\n) # Match the start of the args block, or a newline -\s*(\w+):\s* # Capture the argument name and strip spacing -(.*?)\s* # Capture the argument description, which can span multiple lines, and strip trailing spacing -(?=\n\s*\w+:|\Z) # Stop when you hit the next argument or the end of the block -""", - re.DOTALL | re.VERBOSE, -) -# Extracts the Returns: block from the docstring, if present. Note that most chat templates ignore the return type/doc! 
-returns_re = re.compile(r"\n\s*Returns:\n\s*(.*?)[\n\s]*(Raises:|\Z)", re.DOTALL) - - -class TypeHintParsingException(Exception): - """Exception raised for errors in parsing type hints to generate JSON schemas""" - - -class DocstringParsingException(Exception): - """Exception raised for errors in parsing docstrings to generate JSON schemas""" - - -def _get_json_schema_type(param_type: str) -> Dict[str, str]: - type_mapping = { - int: {"type": "integer"}, - float: {"type": "number"}, - str: {"type": "string"}, - bool: {"type": "boolean"}, - Any: {}, - } - return type_mapping.get(param_type, {"type": "object"}) - - -def _parse_type_hint(hint: str) -> Dict: - origin = get_origin(hint) - args = get_args(hint) - - if origin is None: - try: - return _get_json_schema_type(hint) - except KeyError: - raise TypeHintParsingException( - "Couldn't parse this type hint, likely due to a custom class or object: ", hint - ) - - elif origin is Union: - # Recurse into each of the subtypes in the Union, except None, which is handled separately at the end - subtypes = [_parse_type_hint(t) for t in args if t is not type(None)] - if len(subtypes) == 1: - # A single non-null type can be expressed directly - return_dict = subtypes[0] - elif all(isinstance(subtype["type"], str) for subtype in subtypes): - # A union of basic types can be expressed as a list in the schema - return_dict = {"type": sorted([subtype["type"] for subtype in subtypes])} - else: - # A union of more complex types requires "anyOf" - return_dict = {"anyOf": subtypes} - if type(None) in args: - return_dict["nullable"] = True - return return_dict - - elif origin is list: - if not args: - return {"type": "array"} - else: - # Lists can only have a single type argument, so recurse into it - return {"type": "array", "items": _parse_type_hint(args[0])} - - elif origin is tuple: - if not args: - return {"type": "array"} - if len(args) == 1: - raise TypeHintParsingException( - f"The type hint {str(hint).replace('typing.', '')} is a Tuple with a single element, which " - "we do not automatically convert to JSON schema as it is rarely necessary. If this input can contain " - "more than one element, we recommend " - "using a List[] type instead, or if it really is a single element, remove the Tuple[] wrapper and just " - "pass the element directly." - ) - if ... in args: - raise TypeHintParsingException( - "Conversion of '...' is not supported in Tuple type hints. " - "Use List[] types for variable-length" - " inputs instead." 
- ) - return {"type": "array", "prefixItems": [_parse_type_hint(t) for t in args]} - - elif origin is dict: - # The JSON equivalent to a dict is 'object', which mandates that all keys are strings - # However, we can specify the type of the dict values with "additionalProperties" - out = {"type": "object"} - if len(args) == 2: - out["additionalProperties"] = _parse_type_hint(args[1]) - return out - - raise TypeHintParsingException("Couldn't parse this type hint, likely due to a custom class or object: ", hint) - - -def _convert_type_hints_to_json_schema(func: Callable) -> Dict: - type_hints = get_type_hints(func) - signature = inspect.signature(func) - required = [] - for param_name, param in signature.parameters.items(): - if param.annotation == inspect.Parameter.empty: - raise TypeHintParsingException(f"Argument {param.name} is missing a type hint in function {func.__name__}") - if param.default == inspect.Parameter.empty: - required.append(param_name) - - properties = {} - for param_name, param_type in type_hints.items(): - properties[param_name] = _parse_type_hint(param_type) - - schema = {"type": "object", "properties": properties} - if required: - schema["required"] = required - - return schema - - -def parse_google_format_docstring(docstring: str) -> Tuple[Optional[str], Optional[Dict], Optional[str]]: - """ - Parses a Google-style docstring to extract the function description, - argument descriptions, and return description. - - Args: - docstring (str): The docstring to parse. - - Returns: - The function description, arguments, and return description. - """ - - # Extract the sections - description_match = description_re.search(docstring) - args_match = args_re.search(docstring) - returns_match = returns_re.search(docstring) - - # Clean and store the sections - description = description_match.group(1).strip() if description_match else None - docstring_args = args_match.group(1).strip() if args_match else None - returns = returns_match.group(1).strip() if returns_match else None - - # Parsing the arguments into a dictionary - if docstring_args is not None: - docstring_args = "\n".join([line for line in docstring_args.split("\n") if line.strip()]) # Remove blank lines - matches = args_split_re.findall(docstring_args) - args_dict = {match[0]: re.sub(r"\s*\n+\s*", " ", match[1].strip()) for match in matches} - else: - args_dict = {} - - return description, args_dict, returns - - -def get_json_schema(func: Callable) -> Dict: - """ - This function generates a JSON schema for a given function, based on its docstring and type hints. This is - mostly used for passing lists of tools to a chat template. The JSON schema contains the name and description of - the function, as well as the names, types and descriptions for each of its arguments. `get_json_schema()` requires - that the function has a docstring, and that each argument has a description in the docstring, in the standard - Google docstring format shown below. It also requires that all the function arguments have a valid Python type hint. - - Although it is not required, a `Returns` block can also be added, which will be included in the schema. This is - optional because most chat templates ignore the return value of the function. - - Args: - func: The function to generate a JSON schema for. - - Returns: - A dictionary containing the JSON schema for the function. 
- - Examples: - ```python - >>> def multiply(x: float, y: float): - >>> ''' - >>> A function that multiplies two numbers - >>> - >>> Args: - >>> x: The first number to multiply - >>> y: The second number to multiply - >>> ''' - >>> return x * y - >>> - >>> print(get_json_schema(multiply)) - { - "name": "multiply", - "description": "A function that multiplies two numbers", - "parameters": { - "type": "object", - "properties": { - "x": {"type": "number", "description": "The first number to multiply"}, - "y": {"type": "number", "description": "The second number to multiply"} - }, - "required": ["x", "y"] - } - } - ``` - - The general use for these schemas is that they are used to generate tool descriptions for chat templates that - support them, like so: - - ```python - >>> from transformers import AutoTokenizer - >>> from transformers.utils import get_json_schema - >>> - >>> def multiply(x: float, y: float): - >>> ''' - >>> A function that multiplies two numbers - >>> - >>> Args: - >>> x: The first number to multiply - >>> y: The second number to multiply - >>> return x * y - >>> ''' - >>> - >>> multiply_schema = get_json_schema(multiply) - >>> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01") - >>> messages = [{"role": "user", "content": "What is 179 x 4571?"}] - >>> formatted_chat = tokenizer.apply_chat_template( - >>> messages, - >>> tools=[multiply_schema], - >>> chat_template="tool_use", - >>> return_dict=True, - >>> return_tensors="ms", - >>> add_generation_prompt=True - >>> ) - >>> # The formatted chat can now be passed to model.generate() - ``` - - Each argument description can also have an optional `(choices: ...)` block at the end, such as - `(choices: ["tea", "coffee"])`, which will be parsed into an `enum` field in the schema. Note that this will - only be parsed correctly if it is at the end of the line: - - ```python - >>> def drink_beverage(beverage: str): - >>> ''' - >>> A function that drinks a beverage - >>> - >>> Args: - >>> beverage: The beverage to drink (choices: ["tea", "coffee"]) - >>> ''' - >>> pass - >>> - >>> print(get_json_schema(drink_beverage)) - ``` - { - 'name': 'drink_beverage', - 'description': 'A function that drinks a beverage', - 'parameters': { - 'type': 'object', - 'properties': { - 'beverage': { - 'type': 'string', - 'enum': ['tea', 'coffee'], - 'description': 'The beverage to drink' - } - }, - 'required': ['beverage'] - } - } - """ - doc = inspect.getdoc(func) - if not doc: - raise DocstringParsingException( - f"Cannot generate JSON schema for {func.__name__} because it has no docstring!" 
- ) - doc = doc.strip() - main_doc, param_descriptions, return_doc = parse_google_format_docstring(doc) - - json_schema = _convert_type_hints_to_json_schema(func) - if (return_dict := json_schema["properties"].pop("return", None)) is not None: - if return_doc is not None: # We allow a missing return docstring since most templates ignore it - return_dict["description"] = return_doc - for arg, schema in json_schema["properties"].items(): - if arg not in param_descriptions: - raise DocstringParsingException( - f"Cannot generate JSON schema for {func.__name__} because the docstring has no description for the argument '{arg}'" - ) - desc = param_descriptions[arg] - enum_choices = re.search(r"\(choices:\s*(.*?)\)\s*$", desc, flags=re.IGNORECASE) - if enum_choices: - schema["enum"] = [c.strip() for c in json.loads(enum_choices.group(1))] - desc = enum_choices.string[: enum_choices.start()].strip() - schema["description"] = desc - - output = {"name": func.__name__, "description": main_doc, "parameters": json_schema} - if return_dict is not None: - output["return"] = return_dict - return {"type": "function", "function": output} diff --git a/mindnlp/utils/compatibility.py b/mindnlp/utils/compatibility.py deleted file mode 100644 index cb43c2508..000000000 --- a/mindnlp/utils/compatibility.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""utils for mindspore backward compatibility.""" -import mindspore -from packaging import version - -MIN_COMPATIBLE_VERSION = '1.8.1' -MAX_GRAPH_FIRST_VERSION = '1.12.0' -API_COMPATIBLE_VERSION = '1.10.1' - -MS_VERSION = mindspore.__version__ -MS_VERSION = MS_VERSION.replace('rc', '') - -less_min_minddata_compatible = version.parse(MS_VERSION) <= version.parse(MIN_COMPATIBLE_VERSION) -less_min_compatible = version.parse(MS_VERSION) < version.parse(MIN_COMPATIBLE_VERSION) -less_min_pynative_first = version.parse(MS_VERSION) <= version.parse(MAX_GRAPH_FIRST_VERSION) -less_min_api_compatible = version.parse(MS_VERSION) <= version.parse(API_COMPATIBLE_VERSION) - -__all__ = [ - 'less_min_compatible', - 'less_min_pynative_first', - 'less_min_api_compatible', - 'less_min_minddata_compatible' -] diff --git a/mindnlp/utils/decompress.py b/mindnlp/utils/decompress.py deleted file mode 100644 index 8cd802b84..000000000 --- a/mindnlp/utils/decompress.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright 2022 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
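Before the deleted `decompress.py` continues, a standalone sketch of the version-gating pattern that the `compatibility.py` module above reduces to. The thresholds are copied from the module; the sample version string is made up and stands in for `mindspore.__version__`:

```python
from packaging import version

MIN_COMPATIBLE_VERSION = '1.8.1'
MAX_GRAPH_FIRST_VERSION = '1.12.0'

# Stand-in for mindspore.__version__; 'rc' is stripped exactly as the module does.
ms_version = '1.10.1'.replace('rc', '')

less_min_compatible = version.parse(ms_version) < version.parse(MIN_COMPATIBLE_VERSION)
less_min_pynative_first = version.parse(ms_version) <= version.parse(MAX_GRAPH_FIRST_VERSION)

print(less_min_compatible, less_min_pynative_first)  # False True
```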
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -Decompress functions -""" - -import os -import tarfile -import zipfile -import gzip - -def untar(file_path: str, untar_path: str): - r""" - Untar tar.gz file - - Args: - file_path (str): The path where the tgz file is located. - multiple (str): The directory where the files were unzipped. - - Returns: - - **names** (list) -All filenames in the tar.gz file. - - Raises: - TypeError: If `file_path` is not a string. - TypeError: If `untar_path` is not a string. - - Examples: - >>> file_path = "./mindnlp/datasets/IWSLT2016/2016-01.tgz" - >>> untar_path = "./mindnlp/datasets/IWSLT2016" - >>> output = untar(file_path,untar_path) - >>> print(output[0]) - '2016-01' - - """ - tar = tarfile.open(file_path) - names = tar.getnames() - for name in names: - if os.path.exists(os.path.join(untar_path, name)): - continue - tar.extract(name, untar_path) - tar.close() - return names - - -def unzip(file_path: str, unzip_path: str): - r""" - Untar .zip file - - Args: - file_path (str): The path where the .zip file is located. - unzip_path (str): The directory where the files were unzipped. - - Returns: - - **names** (list) -All filenames in the .zip file. - - Raises: - TypeError: If `file_path` is not a string. - TypeError: If `untar_path` is not a string. - - """ - zipf = zipfile.ZipFile(file_path, "r") - for name in zipf.namelist(): - zipf.extract(name, unzip_path) - zipf.close() - return zipf.namelist() - -def ungz(file_path: str, unzip_path: str = None): - r""" - Untar .gz file - - Args: - file_path (str): The path where the .gz file is located. - unzip_path (str): The directory where the files were unzipped. - - Returns: - - **unzip_path** (str): The directory where the files were unzipped. - - Raises: - TypeError: If `file_path` is not a string. - TypeError: If `untar_path` is not a string. - - """ - if not isinstance(unzip_path,str): - unzip_path = str(file_path)[:-3] - with open(unzip_path,'wb') as file: - gz_file = gzip.open(file_path, mode = 'rb') - file.write(gz_file.read()) - gz_file.close() - return unzip_path diff --git a/mindnlp/utils/download.py b/mindnlp/utils/download.py deleted file mode 100644 index 299f8d201..000000000 --- a/mindnlp/utils/download.py +++ /dev/null @@ -1,1071 +0,0 @@ -# Copyright 2022 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
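Before the deleted `download.py` continues, a self-contained round trip that mirrors the gzip logic of the `ungz` helper above, using only the standard library (paths live in a temporary directory; nothing is imported from MindNLP):

```python
import gzip
import os
import tempfile

# Create a tiny .gz file to decompress.
workdir = tempfile.mkdtemp()
gz_path = os.path.join(workdir, "sample.txt.gz")
with gzip.open(gz_path, "wb") as f:
    f.write(b"hello mindnlp")

# ungz() derives the output path by stripping the ".gz" suffix, then copies the
# decompressed bytes across; the same steps are spelled out here.
out_path = gz_path[:-3]
with open(out_path, "wb") as out, gzip.open(gz_path, "rb") as gz:
    out.write(gz.read())

with open(out_path, "rb") as check:
    print(check.read())  # b'hello mindnlp'
```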
-# ============================================================================ -""" -Download functions -""" - -import os -import shutil -import hashlib -import re -import json -import types -import functools -import sys -import tempfile -import time -from typing import Union, Optional, Dict, Any -from pathlib import Path -from urllib.parse import urlparse, parse_qs -from tqdm.autonotebook import tqdm -import requests -from requests.exceptions import ProxyError, SSLError, HTTPError - -from mindnlp.configs import DEFAULT_ROOT, ENV_VARS_TRUE_VALUES, MINDNLP_CACHE, REPO_TYPES, HF_URL_BASE, \ - HF_TOKEN, MS_URL_BASE -from .errors import ( - EntryNotFoundError, - LocalEntryNotFoundError, - RepositoryNotFoundError, - ModelNotFoundError, - GatedRepoError, - OfflineModeIsEnabled, - RevisionNotFoundError, - raise_for_status -) -from . import logging - -logger = logging.get_logger(__name__) - -_CACHED_NO_EXIST = object() -_CACHED_NO_EXIST_T = Any - -_is_offline_mode = os.environ.get("MINDNLP_OFFLINE", "0").upper() in ENV_VARS_TRUE_VALUES - -def is_offline_mode(): - """ - This function checks if the application is running in offline mode. - - Returns: - None - - """ - return _is_offline_mode - -def is_remote_url(url_or_filename): - """ - Args: - url_or_filename (str): The URL or filename to be checked for being a remote URL. - - Returns: - None: Returns None if the given URL is a remote URL (starts with 'http://' or 'https://'). - - Raises: - N/A - """ - parsed = urlparse(url_or_filename) - return parsed.scheme in ("http", "https") - -def download_url(url, proxies=None): - """ - Downloads a given url in a temporary file. This function is not safe to use in multiple processes. Its only use is - for deprecated behavior allowing to download config/models with a single url instead of using the Hub. - - Args: - url (`str`): The url of the file to download. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. - - Returns: - `str`: The location of the temporary file where the url was downloaded. - """ - return threads_exclusive_http_get(url, tempfile.gettempdir(), download_file_name='tmp_' + url.split('/')[-1], proxies=proxies) - -def copy_func(f): - """Returns a copy of a function f.""" - # Based on http://stackoverflow.com/a/6528148/190597 (Glenn Maynard) - g = types.FunctionType(f.__code__, f.__globals__, name=f.__name__, argdefs=f.__defaults__, closure=f.__closure__) - g = functools.update_wrapper(g, f) - g.__kwdefaults__ = f.__kwdefaults__ - return g - -def extract_filename_from_url(url): - """extract filename from url""" - parsed_url = urlparse(url) - - path_segments = parsed_url.path.split('/') - file_from_path = path_segments[-1] - - # for modelscope - query_params = parse_qs(parsed_url.query) - file_from_query = query_params.get('FilePath', [''])[0] - - return file_from_query if file_from_query else file_from_path - - -def get_cache_path(): - r""" - Get the storage path of the default cache. If the environment 'cache_path' is set, use the environment variable. - - Args: - None - - Returns: - str, the path of default or the environment 'cache_path'. 
- 
- Examples:
- >>> default_cache_path = get_cache_path()
- >>> print(default_cache_path)
- '{home}\.mindnlp'
- """
- if "CACHE_DIR" in os.environ:
- cache_dir = os.environ.get("CACHE_DIR")
- if os.path.isdir(cache_dir):
- return cache_dir
- raise NotADirectoryError(
- f"{os.environ['CACHE_DIR']} is not a directory.")
- cache_dir = DEFAULT_ROOT
-
- return cache_dir
-
-
-def threads_exclusive_http_get(url, storage_folder=None, md5sum=None, download_file_name=None, proxies=None, headers=None):
- pointer_path = os.path.join(storage_folder, download_file_name)
- lock_file_path = pointer_path + ".lock"
- if sys.platform != "win32":
- import fcntl # pylint: disable=import-error
- else:
- import winfcntlock as fcntl # pylint: disable=import-error
- with open(lock_file_path, 'w') as lock_file:
- fd = lock_file.fileno()
- try:
- fcntl.flock(fd, fcntl.LOCK_EX)
- file_path = http_get(url, path=storage_folder, download_file_name=download_file_name, proxies=proxies, headers=headers)
- return file_path
- except Exception as exp:
- raise exp
- finally:
- fcntl.flock(fd, fcntl.LOCK_UN)
-
-
-def http_get(url, path=None, md5sum=None, download_file_name=None, proxies=None, headers=None):
- r"""
- Download from given url, save to path.
-
- Args:
- url (str): download url
- path (str): download to given path (default value: '{home}\.text')
- md5sum (str): The true md5sum of download file.
- download_file_name(str): The name of the downloaded file.\
- (This parameter is required if the end of the link is not the downloaded file name.)
- proxies (dict): a dict to identify proxies, for example: {"https": "https://127.0.0.1:7890"}.
-
- Returns:
- str, the path of the downloaded file.
-
- Raises:
- TypeError: If `url` is not a string.
- RuntimeError: If `url` is None.
- - Examples: - >>> url = 'https://mindspore-website.obs.myhuaweicloud.com/notebook/datasets/aclImdb_v1.tar.gz' - >>> cache_path = http_get(url) - >>> print(cache_path) - ('{home}\.text', '{home}\aclImdb_v1.tar.gz') - - """ - if not os.path.exists(path): - os.makedirs(path) - - retry_cnt = 0 - retry_limit = 5 - chunk_size = 1024 - total_size = 0 - - if download_file_name is None: - name = extract_filename_from_url(url) - else: - name = download_file_name - - file_path = os.path.join(path, name) - - # subfolder - if '/' in name and not os.path.exists(file_path[:file_path.rfind('/')]): - os.makedirs(file_path[:file_path.rfind('/')]) - - while not (os.path.exists(file_path) and check_md5(file_path, md5sum)): - # get downloaded size - tmp_file_path = file_path + "_tmp" - if os.path.exists(tmp_file_path): - file_size = os.path.getsize(tmp_file_path) - if file_size % chunk_size != 0: - file_size = 0 - headers['Range'] = f'bytes={file_size}-' - else: - file_size = 0 - req = requests.get(url, stream=True, timeout=10, proxies=proxies, headers=headers) - - status = req.status_code - - if status in (404, 500): - raise EntryNotFoundError(f"Can not found url: {url}") - if status == 401: - raise GatedRepoError('You should have authorization to access the model.') - if status == 429: - raise HTTPError('Too many requests.') - try: - if file_size == 0: - total_size = int(req.headers.get('content-length', 0)) - else: - if int(req.headers.get('content-length', 0)) == total_size: - total_size = int(req.headers.get('content-length', 0)) - file_size = 0 - else: - total_size = int(req.headers.get('content-length', 0)) + file_size - - with open(tmp_file_path, "ab" if file_size != 0 else "wb") as file: - with tqdm( - total=int(total_size), unit="B", initial=file_size, unit_scale=True, unit_divisor=1024 - ) as pbar: - for chunk in req.iter_content(chunk_size=chunk_size): - if chunk: - file.write(chunk) - pbar.update(len(chunk)) - - shutil.move(tmp_file_path, file_path) - except requests.exceptions.RequestException as e: - if retry_cnt > retry_limit: - raise - print(f"Failed to download: {e}") - print(f"Retrying... (attempt {retry_cnt}/{retry_limit})") - time.sleep(1) # Add a small delay before retrying - - if retry_cnt < retry_limit: - retry_cnt += 1 - else: - raise HTTPError( - f"Download from {url} failed. " "Retry limit reached. \n" - f"If you want to speedup the download, please use `AutoModel.from_pretrained('model_id', mirror='modelers')` instead.\n" - f'The optional mirrors can be ["modelers", "modelscope", "wisemodel", "gitee", "aifast"]') - - return file_path - - -def check_md5(filename: str, md5sum=None): - r""" - Check md5 of download file. - - Args: - filename (str): The fullname of download file. - md5sum (str): The true md5sum of download file. - - Returns: - bool, the md5 check result. - - Raises: - TypeError: If `filename` is not a string. - RuntimeError: If `filename` is None. - - Examples: - >>> filename = 'test' - >>> check_md5_result = check_md5(filename) - True - - """ - if md5sum is None: - return True - - md5 = hashlib.md5() - with open(filename, "rb") as file: - for chunk in iter(lambda: file.read(4096), b""): - md5.update(chunk) - md5hex = md5.hexdigest() - - if md5hex != md5sum: - return False - return True - - -def get_filepath(path: str): - r""" - Get the filepath of file. - - Args: - path (str): The path of the required file. 
- - Returns: - - str, If `path` is a folder containing a file, return `{path}\{filename}`; - if `path` is a folder containing multiple files or a single file, return `path`. - - Raises: - TypeError: If `path` is not a string. - RuntimeError: If `path` is None. - - Examples: - >>> path = '{home}\.text' - >>> get_filepath_result = get_filepath(path) - >>> print(get_filepath_result) - '{home}\.text' - - """ - if os.path.isdir(path): - files = os.listdir(path) - if len(files) == 1: - return os.path.join(path, files[0]) - return path - if os.path.isfile(path): - return path - raise FileNotFoundError(f"{path} is not a valid file or directory.") - -def get_file_from_repo( - path_or_repo: Union[str, os.PathLike], - filename: str, - cache_dir: Optional[Union[str, os.PathLike]] = None, - force_download: bool = False, - resume_download: bool = False, - proxies: Optional[Dict[str, str]] = None, - token: Optional[Union[bool, str]] = None, - revision: Optional[str] = None, - local_files_only: bool = False, - subfolder: str = "", -): - """ - Tries to locate a file in a local folder and repo, downloads and cache it if necessary. - - Args: - path_or_repo (`str` or `os.PathLike`): - This can be either: - - - a string, the *model id* of a model repo on hf-mirror.com. - - a path to a *directory* potentially containing the file. - filename (`str`): - The name of the file to locate in `path_or_repo`. - cache_dir (`str` or `os.PathLike`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the standard - cache should not be used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force to (re-)download the configuration files and override the cached versions if they - exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `huggingface-cli login` (stored in `~/.huggingface`). - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on hf-mirror.com, so `revision` can be any - identifier allowed by git. - local_files_only (`bool`, *optional*, defaults to `False`): - If `True`, will only try to load the tokenizer configuration from local files. - subfolder (`str`, *optional*, defaults to `""`): - In case the relevant files are located inside a subfolder of the model repo on hf-mirror.com, you can - specify the folder name here. - - - - Passing `token=True` is required when you want to use a private model. - - - - Returns: - `Optional[str]`: Returns the resolved file (to the cache folder if downloaded from a repo) or `None` if the - file does not exist. - - Examples: - - ```python - # Download a tokenizer configuration from hf-mirror.com and cache. - tokenizer_config = get_file_from_repo("google-bert/bert-base-uncased", "tokenizer_config.json") - # This model does not have a tokenizer config so the result will be None. 
- tokenizer_config = get_file_from_repo("FacebookAI/xlm-roberta-base", "tokenizer_config.json") - ``` - """ - return cached_file( - path_or_repo_id=path_or_repo, - filename=filename, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - token=token, - revision=revision, - local_files_only=local_files_only, - subfolder=subfolder, - _raise_exceptions_for_gated_repo=False, - _raise_exceptions_for_missing_entries=False, - _raise_exceptions_for_connection_errors=False, - ) - - -def cached_file( - path_or_repo_id: Union[str, os.PathLike], - filename: str, - cache_dir: Optional[Union[str, os.PathLike]] = None, - force_download: bool = False, - resume_download: bool = False, - proxies: Optional[Dict[str, str]] = None, - local_files_only: bool = False, - revision = 'main', - token = None, - subfolder: str = "", - mirror: str = 'huggingface', - repo_type: Optional[str] = None, - user_agent: Optional[Union[str, Dict[str, str]]] = None, - _raise_exceptions_for_gated_repo: bool = True, - _raise_exceptions_for_missing_entries: bool = True, - _raise_exceptions_for_connection_errors: bool = True, - _commit_hash: str = None, -): - """ - Tries to locate a file in a local folder and repo, downloads and cache it if necessary. - - Args: - path_or_repo_id (`str` or `os.PathLike`): - This can be either: - - - a string, the *model id* of a model repo on hf-mirror.com. - - a path to a *directory* potentially containing the file. - filename (`str`): - The name of the file to locate in `path_or_repo`. - cache_dir (`str` or `os.PathLike`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the standard - cache should not be used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force to (re-)download the configuration files and override the cached versions if they - exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `huggingface-cli login` (stored in `~/.huggingface`). - local_files_only (`bool`, *optional*, defaults to `False`): - If `True`, will only try to load the tokenizer configuration from local files. - subfolder (`str`, *optional*, defaults to `""`): - In case the relevant files are located inside a subfolder of the model repo on hf-mirror.com, you can - specify the folder name here. - repo_type (`str`, *optional*): - Specify the repo type (useful when downloading from a space for instance). - - - - Passing `token=True` is required when you want to use a private model. - - - - Returns: - `Optional[str]`: Returns the resolved file (to the cache folder if downloaded from a repo). - - Examples: - - ```python - # Download a model weight from the Hub and cache it. - model_weights_file = cached_file("bert-base-uncased", "pytorch_model.bin") - ```""" - # Private arguments - # _raise_exceptions_for_missing_entries: if False, do not raise an exception for missing entries but return - # None. 
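A usage sketch for the local-directory fast path of the `cached_file` helper being deleted here. It assumes a checkout from before this PR, where `mindnlp.utils.download` still exists, and uses a throwaway directory in place of a real model repo:

```python
import json
import os
import tempfile

# Assumes a pre-PR checkout; this module is removed by the diff shown here.
from mindnlp.utils.download import cached_file

repo_dir = tempfile.mkdtemp()
with open(os.path.join(repo_dir, "config.json"), "w") as f:
    json.dump({"model_type": "bert"}, f)

# Because repo_dir is a local directory, cached_file resolves the file in place
# and never touches the network or the download cache.
resolved = cached_file(repo_dir, "config.json")
print(resolved)  # <repo_dir>/config.json
```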
- # _raise_exceptions_for_connection_errors: if False, do not raise an exception for connection errors but return - # None. - # _commit_hash: passed when we are chaining several calls to various files (e.g. when loading a tokenizer or - # a pipeline). If files are cached for this commit hash, avoid calls to head and get from the cache. - if is_offline_mode() and not local_files_only: - logger.info("Offline mode: forcing local_files_only=True") - local_files_only = True - if subfolder is None: - subfolder = "" - - path_or_repo_id = str(path_or_repo_id) - full_filename = os.path.join(subfolder, filename) - if os.path.isdir(path_or_repo_id): - resolved_file = os.path.join(os.path.join(path_or_repo_id, subfolder), filename) - if not os.path.isfile(resolved_file): - if _raise_exceptions_for_missing_entries: - raise EnvironmentError( - f"{path_or_repo_id} does not appear to have a file named {full_filename}." - ) - return None - return resolved_file - - if cache_dir is None: - cache_dir = MINDNLP_CACHE - if isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - if not force_download: - # If the file is cached under that commit hash, we return it directly. - resolved_file = try_to_load_from_cache( - path_or_repo_id, full_filename, cache_dir=cache_dir, repo_type=repo_type - ) - if resolved_file is not None: - if resolved_file is not object(): - return resolved_file - if not _raise_exceptions_for_missing_entries: - return None - raise EnvironmentError(f"Could not locate {full_filename} inside {path_or_repo_id}.") - try: - # Load from URL or cache if already cached - resolved_file = download( - path_or_repo_id, - filename, - subfolder=None if len(subfolder) == 0 else subfolder, - repo_type=repo_type, - cache_dir=cache_dir, - user_agent=user_agent, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - revision=revision, - token=token, - mirror=mirror - ) - except GatedRepoError as e: - if not _raise_exceptions_for_missing_entries: - return None - if resolved_file is not None or not _raise_exceptions_for_gated_repo: - return resolved_file - raise EnvironmentError( - "You are trying to access a gated repo.\nMake sure to have access to it." - ) from e - except RepositoryNotFoundError as e: - raise EnvironmentError( - f"{path_or_repo_id} is not a local folder and is nost a valid model identifier " - ) from e - except LocalEntryNotFoundError as e: - # We try to see if we have a cached version (not up to date): - resolved_file = try_to_load_from_cache(path_or_repo_id, full_filename, cache_dir=cache_dir) - if resolved_file is not None and resolved_file != _CACHED_NO_EXIST: - return resolved_file - if not _raise_exceptions_for_missing_entries or not _raise_exceptions_for_connection_errors: - return None - raise EnvironmentError( - f"We couldn't load this file, couldn't find it in the" - f" cached files and it looks like {path_or_repo_id} is not the path to a directory containing a file named" - f" {full_filename}.\nCheckout your internet connection or see how to run the library in offline mode at" - ) from e - except EntryNotFoundError as e: - if not _raise_exceptions_for_missing_entries: - return None - raise EnvironmentError( - f"{path_or_repo_id} does not appear to have a file named {full_filename}." 
- ) from e - - except HTTPError as err: - # First we try to see if we have a cached version (not up to date): - resolved_file = try_to_load_from_cache(path_or_repo_id, full_filename, cache_dir=cache_dir) - if resolved_file is not None and resolved_file != object(): - return resolved_file - if not _raise_exceptions_for_connection_errors: - return None - - raise EnvironmentError(f"There was a specific connection error when trying to load {path_or_repo_id}:\n{err}") from err - - return resolved_file - - -def download( - repo_id: str, - filename: str, - *, - subfolder: Optional[str] = None, - repo_type: Optional[str] = None, - cache_dir: Union[str, Path, None] = None, - local_dir: Union[str, Path, None] = None, - user_agent: Union[Dict, str, None] = None, - force_download: bool = False, - proxies: Optional[Dict] = None, - resume_download: bool = False, - local_files_only: bool = False, - revision: str = 'main', - token: str = None, - mirror: str = 'huggingface' -) -> str: - """Download a given file if it's not already present in the local cache. - """ - if cache_dir is None: - cache_dir = MINDNLP_CACHE - if isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - if isinstance(local_dir, Path): - local_dir = str(local_dir) - - if subfolder == "": - subfolder = None - if subfolder is not None: - # This is used to create a URL, and not a local path, hence the forward slash. - filename = f"{subfolder}/{filename}" - - if repo_type is None: - repo_type = "model" - if repo_type not in REPO_TYPES: - raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(REPO_TYPES)}") - - storage_folder = os.path.join(cache_dir, repo_type, repo_id) - os.makedirs(storage_folder, exist_ok=True) - - # cross platform transcription of filename, to be used as a local file path. - relative_filename = os.path.join(*filename.split("/")) - if os.name == "nt": - if relative_filename.startswith("..\\") or "\\..\\" in relative_filename: - raise ValueError( - f"Invalid filename: cannot handle filename '{relative_filename}' on Windows. Please ask the repository" - " owner to rename this file." - ) - - pointer_path = os.path.join(storage_folder, relative_filename) - - if os.path.exists(pointer_path) and not force_download: - return pointer_path - - url = build_download_url(repo_id, filename, revision, repo_type=repo_type, mirror=mirror) - token = HF_TOKEN if not token else token - - headers = None - if token: - headers = { - 'authorization': f"Bearer {token}", - } - else: - headers = {} - try: - pointer_path = threads_exclusive_http_get(url, storage_folder, download_file_name=relative_filename, proxies=proxies, headers=headers) - except Exception as exp: - # Otherwise, our Internet connection is down. - # etag is None - raise exp - - return pointer_path - -# https://modelscope.cn/api/v1/models/mindnlp/THUDM_chatglm-6b/repo?Revision=master&FilePath=mindspore-00001-of-00008.ckpt - -def match_file(filename: str, cache_dir: str) -> str: - r""" - If there is the file in cache_dir, return the path; otherwise, return empty string or error. - - Args: - filename (str): The name of the required file. - cache_dir (str): The path of save the file. - - Returns: - - str, If there is the file in cache_dir, return filename; - if there is no such file, return empty string ''; - if there are two or more matching file, report an error. - - Raises: - TypeError: If `filename` is not a string. - TypeError: If `cache_dir` is not a string. - RuntimeError: If `filename` is None. - RuntimeError: If `cache_dir` is None. 
- - Examples: - >>> name = 'aclImdb_v1.tar.gz' - >>> path = get_cache_path() - >>> match_file_result = match_file(name, path) - - """ - files = os.listdir(cache_dir) - matched_filenames = [] - for file_name in files: - if re.match(filename + "$", file_name): - matched_filenames.append(file_name) - if not matched_filenames: - return "" - if len(matched_filenames) == 1: - return matched_filenames[-1] - raise RuntimeError( - f"Duplicate matched files:{matched_filenames}, this should be caused by a bug." - ) - - -def get_from_cache( - url: str, cache_dir: str = None, md5sum=None, download_file_name=None, proxies=None -): - r""" - If there is the file in cache_dir, return the path; if there is no such file, use the url to download. - - Args: - url (str): The path to download the file. - cache_dir (str): The path of save the file. - md5sum (str): The true md5sum of download file. - download_file_name(str): The name of the downloaded file.\ - (This parameter is required if the end of the link is not the downloaded file name.) - proxies (dict): a dict to identify proxies,for example: {"https": "https://127.0.0.1:7890"}. - - Returns: - - str, The path of save the downloaded file. - - str, The name of downloaded file. - - Raises: - TypeError: If `url` is not a string. - TypeError: If `cache_dir` is not a Path. - RuntimeError: If `url` is None. - - Examples: - >>> path = "https://mindspore-website.obs.myhuaweicloud.com/notebook/datasets/aclImdb_v1.tar.gz" - >>> path, filename = cached_path(path) - >>> print(path, filename) - '{home}\.text' 'aclImdb_v1.tar.gz' - - """ - if cache_dir is None: - raise ValueError('cache dir should not be None.') - - if not os.path.exists(cache_dir): - os.makedirs(cache_dir) - - if download_file_name is None: - filename = extract_filename_from_url(url) - else: - filename = download_file_name - - file_path = os.path.join(cache_dir, filename) - - if os.path.exists(file_path) and check_md5(file_path, md5sum): - return file_path - try: - path = threads_exclusive_http_get(url, cache_dir, md5sum, download_file_name=filename, proxies=proxies) - return path - except (ProxyError, SSLError) as exc: - raise exc - except ModelNotFoundError: - return None - -def try_to_load_from_cache( - repo_id: str, - filename: str, - cache_dir: Union[str, Path, None] = None, - revision: Optional[str] = None, - repo_type: Optional[str] = None, -) -> Union[str, _CACHED_NO_EXIST_T, None]: - """ - Explores the cache to return the latest cached file for a given revision if found. - - This function will not raise any exception if the file in not cached. - - Args: - cache_dir (`str` or `os.PathLike`): - The folder where the cached files lie. - repo_id (`str`): - The ID of the repo on hf-mirror.com. - filename (`str`): - The filename to look for inside `repo_id`. - revision (`str`, *optional*): - The specific model version to use. Will default to `"main"` if it's not provided and no `commit_hash` is - provided either. - repo_type (`str`, *optional*): - The type of the repository. Will default to `"model"`. - - Returns: - `Optional[str]` or `_CACHED_NO_EXIST`: - Will return `None` if the file was not cached. Otherwise: - - The exact path to the cached file if it's found in the cache - - A special value `_CACHED_NO_EXIST` if the file does not exist at the given commit hash and this fact was - cached. - - Example: - - ```python - from huggingface_hub import try_to_load_from_cache, _CACHED_NO_EXIST - - filepath = try_to_load_from_cache() - if isinstance(filepath, str): - # file exists and is cached - ... 
- elif filepath is _CACHED_NO_EXIST: - # non-existence of file is cached - ... - else: - # file is not cached - ... - ``` - """ - if revision is None: - revision = "main" - if repo_type is None: - repo_type = "model" - if repo_type not in REPO_TYPES: - raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(REPO_TYPES)}") - if cache_dir is None: - cache_dir = MINDNLP_CACHE - - repo_cache = os.path.join(cache_dir, f"{repo_type}/{repo_id}") - if not os.path.isdir(repo_cache): - # No cache for this model - return None - - # Check if file exists in cache - cache_file = os.path.join(repo_cache, filename) - return cache_file if os.path.isfile(cache_file) else None - - -def get_checkpoint_shard_files( - pretrained_model_name_or_path, - index_filename, - cache_dir=None, - force_download=False, - proxies=None, - resume_download=False, - local_files_only=False, - revision='main', - token=None, - user_agent=None, - subfolder="", - mirror='huggingface' -): - """ - For a given model: - - - download and cache all the shards of a sharded checkpoint if `pretrained_model_name_or_path` is a model ID on the - Hub - - returns the list of paths to all the shards, as well as some metadata. - - For the description of each arg, see [`PreTrainedModel.from_pretrained`]. `index_filename` is the full path to the - index (downloaded and cached if `pretrained_model_name_or_path` is a model ID on the Hub). - """ - if not os.path.isfile(index_filename): - raise ValueError(f"Can't find a checkpoint index ({index_filename}) in {pretrained_model_name_or_path}.") - - with open(index_filename, "r") as f: - index = json.loads(f.read()) - - shard_filenames = sorted(set(index["weight_map"].values())) - sharded_metadata = index["metadata"] - sharded_metadata["all_checkpoint_keys"] = list(index["weight_map"].keys()) - sharded_metadata["weight_map"] = index["weight_map"].copy() - - # First, let's deal with local folder. - if os.path.isdir(pretrained_model_name_or_path): - shard_filenames = [os.path.join(pretrained_model_name_or_path, subfolder, f) for f in shard_filenames] - return shard_filenames, sharded_metadata - - # At this stage pretrained_model_name_or_path is a model identifier on the Hub - cached_filenames = [] - # Check if the model is already cached or not. We only try the last checkpoint, this should cover most cases of - # downloaded (if interrupted). - last_shard = try_to_load_from_cache( - pretrained_model_name_or_path, shard_filenames[-1], cache_dir=cache_dir - ) - show_progress_bar = last_shard is None or force_download - for shard_filename in tqdm(shard_filenames, desc="Downloading shards", disable=not show_progress_bar): - try: - # Load from URL - cached_filename = cached_file( - pretrained_model_name_or_path, - shard_filename, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - user_agent=user_agent, - subfolder=subfolder, - revision=revision, - token=token, - mirror=mirror - ) - # We have already dealt with RepositoryNotFoundError and RevisionNotFoundError when getting the index, so - # we don't have to catch them here. - except EntryNotFoundError as exc: - raise EnvironmentError( - f"{pretrained_model_name_or_path} does not appear to have a file named {shard_filename} which is " - "required according to the checkpoint index." - ) from exc - except HTTPError as exc: - raise EnvironmentError( - f"We couldn't load {shard_filename}. 
You should try" - " again after checking your internet connection." - ) from exc - - cached_filenames.append(cached_filename) - - return cached_filenames, sharded_metadata - -MIRROR_MAP = { - 'huggingface': HF_URL_BASE, - 'modelscope': MS_URL_BASE, - 'wisemodel': "https://awsdownload.wisemodel.cn/file-proxy/{}/-/raw/{}/{}", - 'gitee': "https://ai.gitee.com/huggingface/{}/resolve/{}/{}", - 'aifast': "https://aifasthub.com/models/{}/{}", - 'modelers': "https://modelers.cn/coderepo/web/v1/file/{}/{}/media/{}" -} - -def build_download_url( - repo_id: str, - filename: str, - revision: str, - *, - subfolder: Optional[str] = None, - repo_type: Optional[str] = None, - mirror: str = 'huggingface' -) -> str: - """Construct the URL of a file from the given information. - """ - if revision is None: - revision = 'main' - if mirror not in MIRROR_MAP: - raise ValueError('The mirror name not support, please use one of the mirror website below: ' - '["huggingface", "modelscope", "wisemodel", "gitee", "aifast", "modelers"]') - if mirror in ('huggingface', 'gitee', 'modelscope', 'wisemodel', 'modelers'): - if mirror == 'modelscope' and revision == 'main': - revision = 'master' - return MIRROR_MAP[mirror].format(repo_id, revision, filename) - if revision is not None and revision != 'main': - logger.warning(f'`revision` is not support when use "{mirror}" website. ' - f'If you want use specific revision, please use "modelscope", "huggingface" or "gitee".') - return MIRROR_MAP[mirror].format(repo_id, filename) - - -REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$") - -def extract_commit_hash(resolved_file: Optional[str], commit_hash: Optional[str]) -> Optional[str]: - """ - Extracts the commit hash from a resolved filename toward a cache file. - """ - if resolved_file is None or commit_hash is not None: - return commit_hash - resolved_file = str(Path(resolved_file).as_posix()) - search = re.search(r"snapshots/([^/]+)/", resolved_file) - if search is None: - return None - commit_hash = search.groups()[0] - return commit_hash if REGEX_COMMIT_HASH.match(commit_hash) else None - -def has_file( - path_or_repo: Union[str, os.PathLike], - filename: str, - revision: Optional[str] = None, - proxies: Optional[Dict[str, str]] = None, - token: Optional[Union[bool, str]] = None, - mirror: str = 'huggingface', - *, - local_files_only: bool = False, - cache_dir: Union[str, Path, None] = None, - repo_type: Optional[str] = None, - **deprecated_kwargs, -): - """ - Checks if a repo contains a given file without downloading it. Works for remote repos and local folders. - - If offline mode is enabled, checks if the file exists in the cache. - - - - This function will raise an error if the repository `path_or_repo` is not valid or if `revision` does not exist for - this repo, but will return False for regular connection errors. - - - """ - - # If path to local directory, check if the file exists - if os.path.isdir(path_or_repo): - return os.path.isfile(os.path.join(path_or_repo, filename)) - - # Else it's a repo => let's check if the file exists in local cache or on the Hub - - # Check if file exists in cache - # This information might be outdated so it's best to also make a HEAD call (if allowed). 
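To make the mirror routing in `build_download_url` above concrete, here are the URLs produced for two mirrors whose templates are fully visible in `MIRROR_MAP` (the `huggingface` and `modelscope` bases come from `mindnlp.configs` and are not reproduced); the snippet repeats the string formatting so it runs on its own:

```python
# Templates copied from MIRROR_MAP above.
GITEE = "https://ai.gitee.com/huggingface/{}/resolve/{}/{}"
AIFAST = "https://aifasthub.com/models/{}/{}"

repo_id, revision, filename = "bert-base-uncased", "main", "config.json"

print(GITEE.format(repo_id, revision, filename))
# https://ai.gitee.com/huggingface/bert-base-uncased/resolve/main/config.json

# 'aifast' has no revision slot, which is why build_download_url logs a warning
# when a non-default revision is requested for that mirror.
print(AIFAST.format(repo_id, filename))
# https://aifasthub.com/models/bert-base-uncased/config.json
```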
- cached_path = try_to_load_from_cache( - repo_id=path_or_repo, - filename=filename, - revision=revision, - repo_type=repo_type, - cache_dir=cache_dir, - ) - has_file_in_cache = isinstance(cached_path, str) - - # If local_files_only, don't try the HEAD call - if local_files_only: - return has_file_in_cache - - # Check if the file exists - try: - url = build_download_url(path_or_repo, filename, revision, repo_type=repo_type, mirror=mirror) - if token: - headers = { - 'authorization': f"Bearer {token}", - } - else: - headers = {} - response = requests.head(url, timeout=10, allow_redirects=False, proxies=proxies, headers=headers) - - except OfflineModeIsEnabled: - return has_file_in_cache - - try: - raise_for_status(response) - return True - except GatedRepoError as e: - logger.error(e) - raise EnvironmentError( - f"{path_or_repo} is a gated repository. Make sure to request access at " - f"https://huggingface.co/{path_or_repo} and pass a token having permission to this repo either by " - "logging in with `huggingface-cli login` or by passing `token=`." - ) from e - except RepositoryNotFoundError as e: - logger.error(e) - raise EnvironmentError( - f"{path_or_repo} is not a local folder or a valid repository name on 'https://hf.co'." - ) from e - except RevisionNotFoundError as e: - logger.error(e) - raise EnvironmentError( - f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for this " - f"model name. Check the model page at 'https://huggingface.co/{path_or_repo}' for available revisions." - ) from e - except EntryNotFoundError: - return False # File does not exist - except requests.HTTPError: - # Any authentication/authorization error will be caught here => default to cache - return has_file_in_cache - -def convert_file_size_to_int(size: Union[int, str]): - """ - Converts a size expressed as a string with digits an unit (like `"5MB"`) to an integer (in bytes). - - Args: - size (`int` or `str`): The size to convert. Will be directly returned if an `int`. - - Example: - ```py - >>> convert_file_size_to_int("1MiB") - 1048576 - ``` - """ - if isinstance(size, int): - return size - if size.upper().endswith("GIB"): - return int(size[:-3]) * (2**30) - if size.upper().endswith("MIB"): - return int(size[:-3]) * (2**20) - if size.upper().endswith("KIB"): - return int(size[:-3]) * (2**10) - if size.upper().endswith("GB"): - int_size = int(size[:-2]) * (10**9) - return int_size // 8 if size.endswith("b") else int_size - if size.upper().endswith("MB"): - int_size = int(size[:-2]) * (10**6) - return int_size // 8 if size.endswith("b") else int_size - if size.upper().endswith("KB"): - int_size = int(size[:-2]) * (10**3) - return int_size // 8 if size.endswith("b") else int_size - raise ValueError("`size` is not in a valid format. Use an integer followed by the unit, e.g., '5GB'.") diff --git a/mindnlp/utils/errors.py b/mindnlp/utils/errors.py deleted file mode 100644 index d02b9294b..000000000 --- a/mindnlp/utils/errors.py +++ /dev/null @@ -1,340 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
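Closing out the download utilities before the deleted `errors.py` continues: a few values for `convert_file_size_to_int` above, traced by hand through its branches (note that a lowercase trailing `b` is treated as bits and divided by 8):

```python
# Expected outputs of convert_file_size_to_int, worked out from its branches above.
expected = {
    "5MB": 5 * 10**6,          # decimal megabytes -> 5_000_000
    "1MiB": 1 * 2**20,         # binary mebibytes  -> 1_048_576 (matches the docstring example)
    "5Mb": (5 * 10**6) // 8,   # lowercase b = megabits -> 625_000
}
print(expected)
```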
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -MindNLP defined Errors. -""" -from typing import Optional -from requests import HTTPError, Response -from requests import JSONDecodeError - -class OfflineModeIsEnabled(ConnectionError): - """Raised when a request is made but `HF_HUB_OFFLINE=1` is set as environment variable.""" - -class MSHTTPError(HTTPError): - """ - HTTPError to inherit from for any custom HTTP Error raised in MindNLP. - - Any HTTPError is converted at least into a `MSHTTPError`. If some information is - sent back by the server, it will be added to the error message. - - Added details: - - Request id from "X-Request-Id" header if exists. - - Server error message from the header "X-Error-Message". - - Server error message if we can found one in the response body. - """ - request_id: Optional[str] = None - server_message: Optional[str] = None - - def __init__(self, message: str, response: Optional[Response] = None): - """ - Initializes an instance of MSHTTPError. - - Args: - self: The instance of the MSHTTPError class. - message (str): The error message associated with the HTTP error. - response (Optional[Response]): The optional response object received during the HTTP request. Defaults to None. - - Returns: - None. This method does not return any value. - - Raises: - JSONDecodeError: If an error occurs while decoding the JSON response. - """ - # Parse server information if any. - if response is not None: - self.request_id = response.headers.get("X-Request-Id") - try: - server_data = response.json() - except JSONDecodeError: - server_data = {} - - # Retrieve server error message from multiple sources - server_message_from_headers = response.headers.get("X-Error-Message") - server_message_from_body = server_data.get("error") - server_multiple_messages_from_body = "\n".join( - error["message"] for error in server_data.get("errors", []) if "message" in error - ) - - # Concatenate error messages - _server_message = "" - if server_message_from_headers is not None: # from headers - _server_message += server_message_from_headers + "\n" - if server_message_from_body is not None: # from body "error" - if isinstance(server_message_from_body, list): - server_message_from_body = "\n".join(server_message_from_body) - if server_message_from_body not in _server_message: - _server_message += server_message_from_body + "\n" - if server_multiple_messages_from_body is not None: # from body "errors" - if server_multiple_messages_from_body not in _server_message: - _server_message += server_multiple_messages_from_body + "\n" - _server_message = _server_message.strip() - - # Set message to `MSHTTPError` (if any) - if _server_message != "": - self.server_message = _server_message - - super().__init__( - _format_error_message( - message, - request_id=self.request_id, - server_message=self.server_message, - ), - response=response, - ) - - def append_to_message(self, additional_message: str) -> None: - """Append additional information to the `MSHTTPError` initial message.""" - self.args = (self.args[0] + additional_message,) + self.args[1:] - - -class ModelNotFoundError(MSHTTPError): - """ - Raised when trying to access a hf.co URL with an invalid repository name, or - with a private repo name the user does not have access to. 
- """ -class RepositoryNotFoundError(MSHTTPError): - """ - Raised when trying to access a hf.co URL with an invalid repository name, or - with a private repo name the user does not have access to. - """ -class GatedRepoError(RepositoryNotFoundError): - """ - Raised when trying to access a gated repository for which the user is not on the - authorized list. - - Note: derives from `RepositoryNotFoundError` to ensure backward compatibility. - - Example: - - ```py - >>> from huggingface_hub import model_info - >>> model_info("") - (...) - huggingface_hub.utils._errors.GatedRepoError: 403 Client Error. (Request ID: ViT1Bf7O_026LGSQuVqfa) - - Cannot access gated repo for url https://hf-mirror.com/api/models/ardent-figment/gated-model. - Access to model ardent-figment/gated-model is restricted and you are not in the authorized list. - Visit https://hf-mirror.com/ardent-figment/gated-model to ask for access. - ``` - """ -class EntryNotFoundError(MSHTTPError): - """ - Raised when trying to access a hf.co URL with a valid repository and revision - but an invalid filename. - - Example: - - ```py - >>> from huggingface_hub import hf_hub_download - >>> hf_hub_download('bert-base-cased', '') - (...) - huggingface_hub.utils._errors.EntryNotFoundError: 404 Client Error. (Request ID: 53pNl6M0MxsnG5Sw8JA6x) - - Entry Not Found for url: https://hf-mirror.com/bert-base-cased/resolve/main/%3Cnon-existent-file%3E. - ``` - """ -class LocalEntryNotFoundError(EntryNotFoundError, FileNotFoundError, ValueError): - """ - Raised when trying to access a file that is not on the disk when network is - disabled or unavailable (connection issue). The entry may exist on the Hub. - - Note: `ValueError` type is to ensure backward compatibility. - Note: `LocalEntryNotFoundError` derives from `HTTPError` because of `EntryNotFoundError` - even when it is not a network issue. - """ - def __init__(self, message: str): - """Initialize a LocalEntryNotFoundError object. - - Args: - self (LocalEntryNotFoundError): The instance of the LocalEntryNotFoundError class. - message (str): The error message associated with the exception. - - Returns: - None: This method does not return any value. - - Raises: - None: This method does not raise any exceptions. - """ - super().__init__(message, response=None) - - -class BadRequestError(MSHTTPError, ValueError): - """ - Raised by `raise_for_status` when the server returns a HTTP 400 error. - - Example: - - ```py - >>> resp = requests.post("hf.co/api/check", ...) - >>> raise_for_status(resp, endpoint_name="check") - huggingface_hub.utils._errors.BadRequestError: Bad request for check endpoint: {details} (Request ID: XXX) - ``` - """ - -class RevisionNotFoundError(MSHTTPError): - """ - Raised when trying to access a hf.co URL with a valid repository but an invalid - revision. - - Example: - - ```py - >>> from huggingface_hub import hf_hub_download - >>> hf_hub_download('bert-base-cased', 'config.json', revision='') - (...) - huggingface_hub.utils._errors.RevisionNotFoundError: 404 Client Error. (Request ID: Mwhe_c3Kt650GcdKEFomX) - - Revision Not Found for url: https://huggingface.co/bert-base-cased/resolve/%3Cnon-existent-revision%3E/config.json. - ``` - """ - -def raise_for_status(response: Response, endpoint_name: Optional[str] = None) -> None: - """ - Internal version of `response.raise_for_status()` that will refine a - potential HTTPError. Raised exception will be an instance of `MSHTTPError`. 
- - This helper is meant to be the unique method to raise_for_status when making a call - to the Hugging Face Hub. - - Example: - ```py - import requests - from huggingface_hub.utils import get_session, raise_for_status, MSHTTPError - - response = get_session().post(...) - try: - raise_for_status(response) - except MSHTTPError as e: - print(str(e)) # formatted message - e.request_id, e.server_message # details returned by server - - # Complete the error message with additional information once it's raised - e.append_to_message("\n`create_commit` expects the repository to exist.") - raise - ``` - - Args: - response (`Response`): - Response from the server. - endpoint_name (`str`, *optional*): - Name of the endpoint that has been called. If provided, the error message - will be more complete. - - - - Raises when the request has failed: - - - [`~utils.RepositoryNotFoundError`] - If the repository to download from cannot be found. This may be because it - doesn't exist, because `repo_type` is not set correctly, or because the repo - is `private` and you do not have access. - - [`~utils.GatedRepoError`] - If the repository exists but is gated and the user is not on the authorized - list. - - [`~utils.RevisionNotFoundError`] - If the repository exists but the revision couldn't be find. - - [`~utils.EntryNotFoundError`] - If the repository exists but the entry (e.g. the requested file) couldn't be - find. - - [`~utils.BadRequestError`] - If request failed with a HTTP 400 BadRequest error. - - [`~utils.MSHTTPError`] - If request failed for a reason not listed above. - - - """ - try: - response.raise_for_status() - except HTTPError as exc: - error_code = response.headers.get("X-Error-Code") - - if error_code == "EntryNotFound": - message = f"{response.status_code} Client Error." + "\n\n" + f"Entry Not Found for url: {response.url}." - raise EntryNotFoundError(message, response) from exc - - if error_code == "GatedRepo": - message = ( - f"{response.status_code} Client Error." + "\n\n" + f"Cannot access gated repo for url {response.url}." - ) - raise GatedRepoError(message, response) from exc - - if ( - response.status_code == 401 - and response.request.url is not None - and "/api/collections" in response.request.url - ): - # Collection not found. We don't raise a custom error for this. - # This prevent from raising a misleading `RepositoryNotFoundError` (see below). - pass - - if ( - response.status_code == 401 - and response.request.url is not None - ): - # Not enough permission to list Inference Endpoints from this org. We don't raise a custom error for this. - # This prevent from raising a misleading `RepositoryNotFoundError` (see below). - pass - - if error_code == "RepoNotFound" or response.status_code == 401: - # 401 is misleading as it is returned for: - # - private and gated repos if user is not authenticated - # - missing repos - # => for now, we process them as `RepoNotFound` anyway. - # See https://gist.github.com/Wauplin/46c27ad266b15998ce56a6603796f0b9 - message = ( - f"{response.status_code} Client Error." - + "\n\n" - + f"Repository Not Found for url: {response.url}." - + "\nPlease make sure you specified the correct `repo_id` and" - " `repo_type`.\nIf you are trying to access a private or gated repo," - " make sure you are authenticated." 
- ) - raise RepositoryNotFoundError(message, response) from exc - - if response.status_code == 400: - message = ( - f"\n\nBad request for {endpoint_name} endpoint:" if endpoint_name is not None else "\n\nBad request:" - ) - raise BadRequestError(message, response=response) from exc - - # Convert `HTTPError` into a `MSHTTPError` to display request information - # as well (request id and/or server error message) - raise MSHTTPError(str(exc), response=response) from exc - - -def _format_error_message(message: str, request_id: Optional[str], server_message: Optional[str]) -> str: - """ - Format the `MSHTTPError` error message based on initial message and information - returned by the server. - - Used when initializing `MSHTTPError`. - """ - # Add message from response body - if server_message is not None and len(server_message) > 0 and server_message.lower() not in message.lower(): - if "\n\n" in message: - message += "\n" + server_message - else: - message += "\n\n" + server_message - - # Add Request ID - if request_id is not None and str(request_id).lower() not in message.lower(): - request_id_message = f" (Request ID: {request_id})" - if "\n" in message: - newline_index = message.index("\n") - message = message[:newline_index] + request_id_message + message[newline_index:] - else: - message += request_id_message - - return message diff --git a/mindnlp/utils/generic.py b/mindnlp/utils/generic.py deleted file mode 100644 index 2d6c8422a..000000000 --- a/mindnlp/utils/generic.py +++ /dev/null @@ -1,568 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -Generic utils. -""" -import inspect -from enum import Enum -from collections import OrderedDict, UserDict -from dataclasses import fields -from typing import Any, Tuple, ContextManager, List -from contextlib import ExitStack - -import numpy as np -import mindspore -from mindspore.common.api import _pynative_executor -from .import_utils import is_mindspore_available - - -def is_tensor(x): - """ - Tests if `x` is a `mindspore.Tensor` or `np.ndarray`. - """ - if isinstance(x, mindspore.Tensor): - return True - - return isinstance(x, np.ndarray) - -def _is_mindspore(x): - """ - Checks if the input x is a MindSpore tensor. - - Args: - x (object): The input object to be checked. - - Returns: - None: This function does not return any value. - - Raises: - None: This function does not raise any exceptions. - """ - return isinstance(x, mindspore.Tensor) - - -def is_mindspore_tensor(x): - """ - Tests if `x` is a torch tensor or not. Safe to call even if torch is not installed. - """ - return False if not is_mindspore_available() else _is_mindspore(x) - -class ExplicitEnum(str, Enum): - """ - Enum with more explicit error message for missing values. 
- """ - @classmethod - def _missing_(cls, value): - """ - This method `_missing_` in the class `ExplicitEnum` is a class method used to handle missing values in the ExplicitEnum class. - - Args: - cls (class): The class itself, used for referring to the class instance inside the method. - value (any): The value that was not found in the ExplicitEnum class. - - Returns: - None: This method does not return any value as it raises an exception when called. - - Raises: - ValueError: If the value provided is not a valid member of the Enum class, a ValueError is raised with a message listing the valid options to choose from. - """ - raise ValueError( - f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}" - ) - -class TensorType(ExplicitEnum): - """ - Possible values for the `return_tensors` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for - tab-completion in an IDE. - """ - MINDSPORE = "ms" - NUMPY = "np" - -class PaddingStrategy(ExplicitEnum): - """ - Possible values for the `padding` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion in an - IDE. - """ - LONGEST = "longest" - MAX_LENGTH = "max_length" - DO_NOT_PAD = "do_not_pad" - - -class ModelOutput(OrderedDict): - """ - Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a - tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular - python dictionary. - - - - You can't unpack a `ModelOutput` directly. Use the [`~utils.ModelOutput.to_tuple`] method to convert it to a tuple - before. - - - """ - - def __post_init__(self): - """Perform post-initialization actions for the ModelOutput class. - - This method is automatically called after the initialization of a ModelOutput object. - - Args: - self: An instance of the ModelOutput class. - - Returns: - None - - Raises: - ValueError: If the ModelOutput object has no fields or more than one required field. - ValueError: If a key/value pair in the first field is not a tuple or if it does not follow the format (key, value). - ValueError: If the key/value pair cannot be set for a given element in the first field. - """ - class_fields = fields(self) - - # Safety and consistency checks - if len(class_fields) == 0: - raise ValueError(f"{self.__class__.__name__} has no fields.") - if not all(field.default is None for field in class_fields[1:]): - raise ValueError(f"{self.__class__.__name__} should not have more than one required field.") - - first_field = getattr(self, class_fields[0].name) - other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:]) - - if other_fields_are_none and not is_tensor(first_field): - if isinstance(first_field, dict): - iterator = first_field.items() - first_field_iterator = True - else: - try: - iterator = iter(first_field) - first_field_iterator = True - except TypeError: - first_field_iterator = False - - # if we provided an iterator as first field and the iterator is a (key, value) iterator - # set the associated fields - if first_field_iterator: - for idx, element in enumerate(iterator): - if ( - not isinstance(element, (list, tuple)) - or not len(element) == 2 - or not isinstance(element[0], str) - ): - if idx == 0: - # If we do not have an iterator of key/values, set it as attribute - self[class_fields[0].name] = first_field - else: - # If we have a mixed iterator, raise an error - raise ValueError( - f"Cannot set key/value for {element}. 
It needs to be a tuple (key, value)." - ) - break - setattr(self, element[0], element[1]) - if element[1] is not None: - self[element[0]] = element[1] - elif first_field is not None: - self[class_fields[0].name] = first_field - else: - for field in class_fields: - v = getattr(self, field.name) - if v is not None: - self[field.name] = v - - def __delitem__(self, *args, **kwargs): - """ - __delitem__ - - Deletes an item from the ModelOutput instance. - - Args: - self (ModelOutput): The ModelOutput instance from which the item will be deleted. - - Returns: - None. This method does not return a value. - - Raises: - RuntimeError: If the '__delitem__' method is attempted to be used on a ModelOutput instance, a RuntimeError is raised with a message indicating that this method cannot be used on the instance. - """ - raise RuntimeError(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.") - - def setdefault(self, *args, **kwargs): - """ - Sets a default value in the ModelOutput instance. - - Args: - self: The ModelOutput instance itself. - - Returns: - None. This method does not return any value. - - Raises: - RuntimeError: This exception is raised if the method 'setdefault' is called on a ModelOutput instance. The message in the exception states that the 'setdefault' method cannot be used on a -ModelOutput instance. - - Note: - The 'setdefault' method is not supported for ModelOutput instances as it can only be used on dictionary objects. - """ - raise RuntimeError(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.") - - def pop(self, *args, **kwargs): - """ - Method that raises a RuntimeError to prevent the use of 'pop' on a ModelOutput instance. - - Args: - self (object): The ModelOutput instance on which 'pop' is being called. - This parameter is required and represents the current instance of the class. - - Returns: - None. This method does not return any value. - - Raises: - RuntimeError: Raised when attempting to use 'pop' method on a ModelOutput instance. The exception message - specifies that 'pop' cannot be used on a ModelOutput instance to prevent unintended behavior. - """ - raise RuntimeError(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") - - def update(self, *args, **kwargs): - """ - Updates the current instance of the ModelOutput class. - - Args: - self (ModelOutput): The instance of the ModelOutput class. - - Returns: - None: This method does not return any value. - - Raises: - RuntimeError: If the method is called on an instance of the ModelOutput class. This is to prevent using the 'update' method on a ModelOutput instance, as it is not allowed. - - Note: - The 'update' method is not allowed to be used on a ModelOutput instance. If called, it will raise a RuntimeError. - """ - raise RuntimeError(f"You cannot use ``update`` on a {self.__class__.__name__} instance.") - - def __getitem__(self, k): - """ - This method allows accessing the elements of the ModelOutput object using the square bracket notation. - - Args: - self (ModelOutput): The instance of the ModelOutput class. - k (str or int): The key or index for accessing the element. If k is a string, it is used as a key to retrieve the corresponding value. If k is an integer, it is used as an index to retrieve the -element. - - Returns: - None: This method does not return any value directly. The retrieved value is returned based on the input key or index. - - Raises: - TypeError: If the input parameter k is not a string or an integer. 
- KeyError: If the input key k is not found in the internal dictionary when k is a string. - IndexError: If the input index k is out of range when k is an integer. - """ - if isinstance(k, str): - inner_dict = dict(self.items()) - return inner_dict[k] - return self.to_tuple()[k] - - def __setattr__(self, name, value): - """ - Method __setattr__ in the class ModelOutput sets the value for the specified attribute name. - - Args: - self (object): The instance of the ModelOutput class. - name (str): The name of the attribute to be set. - value (any): The value to be assigned to the attribute. It can be of any type. - - Returns: - None. This method does not return any value. - - Raises: - No specific exceptions are raised by this method. However, if the attribute name is not in the keys of the object, it will be added as a new attribute. If the value is None, the attribute will be -set to None. - """ - if name in self.keys() and value is not None: - # Don't call self.__setitem__ to avoid recursion errors - super().__setitem__(name, value) - super().__setattr__(name, value) - - def __setitem__(self, key, value): - """ - This method '__setitem__' in the class 'ModelOutput' allows setting key-value pairs in the model output object. - - Args: - self (ModelOutput): The instance of the ModelOutput class. - key (Any): The key to be set in the model output object. - value (Any): The value corresponding to the key to be set in the model output object. - - Returns: - None. This method does not return any value explicitly. - - Raises: - This method may raise the following exceptions: - - TypeError: If the key is not of a valid type. - - ValueError: If the value is not acceptable for the given key. - - Other exceptions related to the internal implementation of the ModelOutput class. - """ - # Will raise a KeyException if needed - super().__setitem__(key, value) - # Don't call self.__setattr__ to avoid recursion errors - super().__setattr__(key, value) - - def to_tuple(self) -> Tuple[Any]: - """ - Convert self to a tuple containing all the attributes/keys that are not `None`. - """ - return tuple(v for _, v in self.items()) - -# vendored from distutils.util -def strtobool(val): - """Convert a string representation of truth to true (1) or false (0). - - True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values are 'n', 'no', 'f', 'false', 'off', and '0'. - Raises ValueError if 'val' is anything else. - """ - val = val.lower() - if val in {"y", "yes", "t", "true", "on", "1"}: - return 1 - if val in {"n", "no", "f", "false", "off", "0"}: - return 0 - raise ValueError(f"invalid truth value {val!r}") - -class cached_property(property): - """ - Descriptor that mimics @property but caches output in member variable. - - From tensorflow_datasets - - Built-in in functools from Python 3.8. - """ - def __get__(self, obj, objtype=None): - """ - Method '__get__' in the class 'cached_property'. - - Args: - self (object): The current instance of the class. - obj (object): The object on which the method is being called. - objtype (object): The type of the object, if available. Defaults to None. - - Returns: - None: The method returns a value of type None. - - Raises: - AttributeError: If the attribute is unreadable, this exception is raised. 
- """ - # See docs.python.org/3/howto/descriptor.html#properties - if obj is None: - return self - if self.fget is None: - raise AttributeError("unreadable attribute") - attr = "__cached_" + self.fget.__name__ - cached = getattr(obj, attr, None) - if cached is None: - cached = self.fget(obj) - setattr(obj, attr, cached) - return cached - -def _is_numpy(x): - """ - This function checks if the input is a NumPy array. - - Args: - x (any): The input to be checked for being a NumPy array. - - Returns: - None: This function does not return a value. - - Raises: - None - """ - return isinstance(x, np.ndarray) - - -def is_numpy_array(x): - """ - Tests if `x` is a numpy array or not. - """ - return _is_numpy(x) - -def infer_framework_from_repr(x): - """ - Tries to guess the framework of an object `x` from its repr (brittle but will help in `is_tensor` to try the - frameworks in a smart order, without the need to import the frameworks). - """ - representation = str(type(x)) - if representation.startswith("= (3, 8): - # For Python 3.8 and later - from importlib import metadata as importlib_metadata -else: - # For Python versions earlier than 3.8 - import importlib_metadata - - -logger = logging.get_logger(__name__) - -def _is_package_available( - pkg_name: str, return_version: bool = False -) -> Union[Tuple[bool, str], bool]: - """ - Checks if a specified package is available and optionally returns its version. - - Args: - pkg_name (str): The name of the package to check for availability. - return_version (bool, optional): Indicates whether to return the package version along with availability status. Defaults to False. - - Returns: - Union[Tuple[bool, str], bool]: If return_version is True, returns a tuple containing a boolean indicating package availability and a string representing the package version. - If return_version is False, returns a boolean indicating package availability. - - Raises: - No specific exceptions are raised within this function. 
- """ - # Check we're not importing a "pkg_name" directory somewhere but the actual library by trying to grab the version - package_exists = importlib.util.find_spec(pkg_name) is not None - package_version = "N/A" - if package_exists: - try: - package_version = importlib_metadata.version(pkg_name) - package_exists = True - except importlib_metadata.PackageNotFoundError: - package_exists = False - logger.debug(f"Detected {pkg_name} version {package_version}") - if return_version: - return package_exists, package_version - return package_exists - - -_ftfy_available = _is_package_available("ftfy") -_einops_available = _is_package_available('einops') -_tiktoken_available = _is_package_available('tiktoken') -_bs4_available = importlib.util.find_spec("bs4") is not None -_pytest_available = _is_package_available("pytest") -_datasets_available = _is_package_available("datasets") -_sentencepiece_available = _is_package_available("sentencepiece") -_soundfile_available = _is_package_available("soundfile") -_tokenizers_available = _is_package_available("tokenizers") -_pyctcdecode_available = _is_package_available("pyctcdecode") -_safetensors_available = _is_package_available("safetensors") -_modelscope_available = _is_package_available("modelscope") -_jieba_available = _is_package_available("jieba") -_pytesseract_available = _is_package_available("pytesseract") -_g2p_en_available = _is_package_available("g2p_en") -_phonemizer_available = _is_package_available("phonemizer") -_mindspore_version, _mindspore_available = _is_package_available( - "mindspore", return_version=True -) -_sudachipy_available, _sudachipy_version = _is_package_available("sudachipy", return_version=True) - -_librosa_available = _is_package_available("librosa") -_scipy_available = _is_package_available("scipy") -_triton_available = _is_package_available("triton") -_sacremoses_available = _is_package_available("sacremoses") -_torchaudio_available = _is_package_available("pykaldi") -_kenlm_available = _is_package_available("kenlm") -_datamodel_code_generator_availabel = _is_package_available('datamodel_code_generator') -_pretty_midi_available = importlib.util.find_spec("pretty_midi") is not None -try: - _pretty_midi_version = importlib_metadata.version("pretty_midi") - logger.debug(f"Successfully imported pretty_midi version {_pretty_midi_version}") -except importlib_metadata.PackageNotFoundError: - _pretty_midi_available = False - -_essentia_available = importlib.util.find_spec("essentia") is not None -try: - _essentia_version = importlib_metadata.version("essentia") - logger.debug(f"Successfully imported essentia version {_essentia_version}") -except importlib_metadata.PackageNotFoundError: - _essentia_version = False - -_levenshtein_available = _is_package_available("Levenshtein") -_nltk_available = _is_package_available("nltk") - - -_faiss_available = importlib.util.find_spec("faiss") is not None -try: - _faiss_version = importlib.metadata.version("faiss") - logger.debug(f"Successfully imported faiss version {_faiss_version}") -except importlib.metadata.PackageNotFoundError: - try: - _faiss_version = importlib.metadata.version("faiss-cpu") - logger.debug(f"Successfully imported faiss version {_faiss_version}") - except importlib.metadata.PackageNotFoundError: - _faiss_available = False - -def is_triton_available(): - return _triton_available - -def is_datamodel_code_generator_availabel(): - return _datamodel_code_generator_availabel - -def is_faiss_available(): - return _faiss_available - -def is_levenshtein_available(): - return 
_levenshtein_available - - -def is_nltk_available(): - return _nltk_available - - -def is_einops_available(): - return _einops_available - - -def is_sudachi_available(): - """ - Checks if SudachiPy is available for use. - - Returns: - None: Indicates whether SudachiPy is available or not. - - """ - return _sudachipy_available - - -def get_sudachi_version(): - ''' - Returns the version of SudachiPy. - - Returns: - None: This function does not take any parameters. - - Raises: - None - ''' - return _sudachipy_version - - -def is_bs4_available(): - return _bs4_available - -def is_sudachi_projection_available(): - """ - Checks if Sudachi projection is available. - - This function checks if Sudachi is available and if the Sudachi version is equal to or greater than 0.6.8. - - Returns: - None - - Raises: - None - """ - if not is_sudachi_available(): - return False - - # NOTE: We require sudachipy>=0.6.8 to use projection option in sudachi_kwargs for the forwardor of BertJapaneseTokenizer. - # - `projection` option is not supported in sudachipy<0.6.8, see https://github.com/WorksApplications/sudachi.rs/issues/230 - return version.parse(_sudachipy_version) >= version.parse("0.6.8") - -def is_sacremoses_available(): - """ - Checks if the sacremoses library is available in the current environment. - - Returns: - None: Indicates whether the sacremoses library is available or not. - - Raises: - None. - """ - return _sacremoses_available - - -def is_mindspore_available(): - ''' - Checks if MindSpore is available. - - Args: - None - - Returns: - None: Indicates that the function does not return any value. - - Raises: - None: No exceptions are raised by this function. - ''' - return _mindspore_available - - -def get_mindspore_version(): - """ - Returns the current version of MindSpore. - - Args: - - Returns: - None: This function does not take any parameters. - - Raises: - None: This function does not raise any exceptions. - """ - return _mindspore_version - - - -def is_ftfy_available(): - return _ftfy_available - - -def is_datasets_available(): - """ - Checks if datasets are available. - - Returns: - None: This function does not return any value. - - Raises: - None: This function does not raise any exceptions. - """ - return _datasets_available - - -def is_sentencepiece_available(): - """ - Checks if SentencePiece library is available. - - Returns: - None: Indicates whether the SentencePiece library is available or not. - - Raises: - None. - """ - return _sentencepiece_available - - -def is_tokenizers_available(): - """Check if tokenizers are available. - - This function checks if tokenizers are available for use. It does not take any parameters. - - Returns: - None: This function does not return any value. - - Raises: - None: This function does not raise any exceptions. - """ - return _tokenizers_available - - -def is_safetensors_available(): - """ - Checks if SafeTensors is available in the current environment. - - Returns: - None: Indicates whether SafeTensors is available or not. - - """ - return _safetensors_available - - -def is_modelscope_available(): - ''' - Checks if the model scope is available. - - Returns: - None: Indicates whether the model scope is available or not. - ''' - return _modelscope_available - - -def is_cython_available(): - """ - Checks if Cython is available in the current environment. - - Returns: - None: Indicates whether Cython is available or not. 
- - Raises: - None - """ - return importlib.util.find_spec("pyximport") is not None - - -def is_protobuf_available(): - """ - Checks if the Google Protocol Buffers (protobuf) library is available. - - Returns: - bool: True if the protobuf library is available, False otherwise. - - Raises: - No specific exceptions are raised by this function. - """ - if importlib.util.find_spec("google") is None: - return False - return importlib.util.find_spec("google.protobuf") is not None - - -def is_pytest_available(): - """ - Check if the pytest library is available. - - Returns: - None: This function does not return any value. - - """ - return _pytest_available - - -def is_pretty_midi_available(): - """ - Checks if the 'pretty_midi' library is available. - - Returns: - None - - Raises: - None - """ - return _pretty_midi_available - - -def is_librosa_available(): - """ - Checks if the 'librosa' library is available. - - Returns: - None - - Raises: - None - """ - return _librosa_available - - -def is_essentia_available(): - """ - Checks if the 'essentia' library is available. - - Returns: - None. - - Raises: - None. - """ - return _essentia_available - - -def is_pyctcdecode_available(): - """ - Check if the PyCTCDecode library is available. - - Returns: - None: This function does not return any value. - - Raises: - None - """ - return _pyctcdecode_available - - -def is_scipy_available(): - """ - Checks if the SciPy library is available. - - Returns: - None: This function does not return any value. - - Raises: - None: This function does not raise any exceptions. - """ - return _scipy_available - - -def is_jieba_available(): - ''' - Checks if the Jieba library is available. - - Returns: - None: The function does not return any value. - - ''' - return _jieba_available - - -def is_pytesseract_available(): - """ - Check if pytesseract library is available. - - Returns: - None: This function does not return any value. - - Raises: - None: This function does not raise any exceptions. - """ - return _pytesseract_available - - -def is_g2p_en_available(): - return _g2p_en_available - - -def is_tiktoken_available(): - return _tiktoken_available - - -def is_phonemizer_available(): - return _phonemizer_available - - -@lru_cache() -def is_vision_available(): - """ - Checks if the Pillow library is available for image processing. - - Returns: - bool: True if Pillow library is available, False otherwise. - - Raises: - PackageNotFoundError: If Pillow or Pillow-SIMD package is not found. - """ - _pil_available = importlib.util.find_spec("PIL") is not None - if _pil_available: - try: - package_version = importlib_metadata.version("Pillow") - except importlib_metadata.PackageNotFoundError: - try: - package_version = importlib_metadata.version("Pillow-SIMD") - except importlib_metadata.PackageNotFoundError: - return False - logger.debug(f"Detected PIL version {package_version}") - return _pil_available - - -def is_in_notebook(): - """ - This function checks if the code is running in a Jupyter notebook environment by examining the current execution environment and relevant environment variables. - - Returns: - bool: Returns True if the code is running in a Jupyter notebook environment, otherwise False. - - Raises: - AttributeError: If an attribute error occurs during the execution of the function. - ImportError: If the code is running in the console, VS Code, or Databricks environment, respective ImportError with the environment name is raised. - KeyError: If a key error occurs during the execution of the function. 
- """ - try: - # Test adapted from tqdm.autonotebook: https://github.com/tqdm/tqdm/blob/master/tqdm/autonotebook.py - get_ipython = sys.modules["IPython"].get_ipython - if "IPKernelApp" not in get_ipython().config: - raise ImportError("console") - if "VSCODE_PID" in os.environ: - raise ImportError("vscode") - if ( - "DATABRICKS_RUNTIME_VERSION" in os.environ - and os.environ["DATABRICKS_RUNTIME_VERSION"] < "11.0" - ): - # Databricks Runtime 11.0 and above uses IPython kernel by default so it should be compatible with Jupyter notebook - # https://docs.microsoft.com/en-us/azure/databricks/notebooks/ipython-kernel - raise ImportError("databricks") - - return importlib.util.find_spec("IPython") is not None - except (AttributeError, ImportError, KeyError): - return False - - -# docstyle-ignore -CYTHON_IMPORT_ERROR = """ -{0} requires the Cython library but it was not found in your environment. You can install it with pip: `pip install -Cython`. Please note that you may need to restart your runtime after installation. -""" - -# docstyle-ignore -DATASETS_IMPORT_ERROR = """ -{0} requires the 🤗 Datasets library but it was not found in your environment. You can install it with: -``` -pip install datasets -``` -In a notebook or a colab, you can install it by executing a cell with -``` -!pip install datasets -``` -then restarting your kernel. - -Note that if you have a local folder named `datasets` or a local python file named `datasets.py` in your current -working directory, python may try to import this instead of the 🤗 Datasets library. You should rename this folder or -that python file if that's the case. Please note that you may need to restart your runtime after installation. -""" - -# docstyle-ignore -TOKENIZERS_IMPORT_ERROR = """ -{0} requires the 🤗 Tokenizers library but it was not found in your environment. You can install it with: -``` -pip install tokenizers -``` -In a notebook or a colab, you can install it by executing a cell with -``` -!pip install tokenizers -``` -Please note that you may need to restart your runtime after installation. -""" - -# docstyle-ignore -SENTENCEPIECE_IMPORT_ERROR = """ -{0} requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the -installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones -that match your environment. Please note that you may need to restart your runtime after installation. -""" - -# docstyle-ignore -PROTOBUF_IMPORT_ERROR = """ -{0} requires the protobuf library but it was not found in your environment. Checkout the instructions on the -installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones -that match your environment. Please note that you may need to restart your runtime after installation. -""" - -# docstyle-ignore -MINDSPORE_IMPORT_ERROR = """ -{0} requires the MindSpore library but it was not found in your environment. Checkout the instructions on the -installation page: https://www.mindspore.cn/install/ and follow the ones that match your environment. -Please note that you may need to restart your runtime after installation. -""" - -LIBROSA_IMPORT_ERROR = """ -{0} requires thes librosa library. But that was not found in your environment. You can install them with pip: -`pip install librosa` -Please note that you may need to restart your runtime after installation. -""" - -ESSENTIA_IMPORT_ERROR = """ -{0} requires essentia library. 
But that was not found in your environment. You can install them with pip: -`pip install essentia==2.1b6.dev1034` -Please note that you may need to restart your runtime after installation. -""" - -SCIPY_IMPORT_ERROR = """ -{0} requires the scipy library but it was not found in your environment. You can install it with pip: -`pip install scipy`. Please note that you may need to restart your runtime after installation. -""" - -PRETTY_MIDI_IMPORT_ERROR = """ -{0} requires thes pretty_midi library. But that was not found in your environment. You can install them with pip: -`pip install pretty_midi` -Please note that you may need to restart your runtime after installation. -""" - -# docstyle-ignore -PYCTCDECODE_IMPORT_ERROR = """ -{0} requires the pyctcdecode library but it was not found in your environment. You can install it with pip: -`pip install pyctcdecode`. Please note that you may need to restart your runtime after installation. -""" - -JIEBA_IMPORT_ERROR = """ -{0} requires the jieba library but it was not found in your environment. You can install it with pip: `pip install -jieba`. Please note that you may need to restart your runtime after installation. -""" - -VISION_IMPORT_ERROR = """ -{0} requires the PIL library but it was not found in your environment. You can install it with pip: -`pip install pillow`. Please note that you may need to restart your runtime after installation. -""" - -# docstyle-ignore -G2P_EN_IMPORT_ERROR = """ -{0} requires the g2p-en library but it was not found in your environment. You can install it with pip: -`pip install g2p-en`. Please note that you may need to restart your runtime after installation. -""" - -BACKENDS_MAPPING = OrderedDict( - [ - ("mindspore", (is_mindspore_available, MINDSPORE_IMPORT_ERROR)), - ("cython", (is_cython_available, CYTHON_IMPORT_ERROR)), - ("datasets", (is_datasets_available, DATASETS_IMPORT_ERROR)), - ("protobuf", (is_protobuf_available, PROTOBUF_IMPORT_ERROR)), - ("sentencepiece", (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)), - ("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)), - ("librosa", (is_librosa_available, LIBROSA_IMPORT_ERROR)), - ("essentia", (is_essentia_available, ESSENTIA_IMPORT_ERROR)), - ("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)), - ("pretty_midi", (is_pretty_midi_available, PRETTY_MIDI_IMPORT_ERROR)), - ("pyctcdecode", (is_pyctcdecode_available, PYCTCDECODE_IMPORT_ERROR)), - ("jieba", (is_jieba_available, JIEBA_IMPORT_ERROR)), - ("vision", (is_vision_available, VISION_IMPORT_ERROR)), - ("g2p_en", (is_g2p_en_available, G2P_EN_IMPORT_ERROR)), - ] -) - - -def requires_backends(obj, backends): - """ - Function to check if the specified backends are available for the given object. - - Args: - obj (object): The object for which backends availability needs to be checked. - backends (list or tuple or str): The backend(s) to be checked for availability. Can be a single backend as a string or a list/tuple of backends. - - Returns: - None. This function does not return any value. - - Raises: - ImportError: If any of the specified backends are not available for the object. 
- """ - if not isinstance(backends, (list, tuple)): - backends = [backends] - - name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ - - checks = (BACKENDS_MAPPING[backend] for backend in backends) - failed = [msg.format(name) for available, msg in checks if not available()] - if failed: - raise ImportError("".join(failed)) - - -class DummyObject(type): - """ - Metaclass for the dummy objects. Any class inheriting from it will return the ImportError generated by - `requires_backend` each time a user tries to access any method of that class. - """ - def __getattribute__(cls, key): - """ - This method is called automatically when an attribute is accessed on the 'DummyObject' class or any of its subclasses. - - Args: - cls (type): The class object that the method was called on. - key (str): The name of the attribute being accessed. - - Returns: - None: This method does not return any value. - - Raises: - None: This method does not raise any exceptions. - """ - if key.startswith("_") and key != "_from_config": - return super().__getattribute__(key) - requires_backends(cls, cls._backends) - - -def mindspore_required(func): - """ - This function decorates another function to require the presence of MindSpore framework. - - Args: - func (function): The function to be decorated. - - Returns: - None. The function returns None. - - Raises: - FutureWarning: If the method `torch_required` is deprecated. - ImportError: If the decorated function requires MindSpore but MindSpore is not available. - """ - warnings.warn( - "The method `torch_required` is deprecated. Use `requires_backends` instead.", - FutureWarning, - ) - - # Chose a different decorator name than in tests so it's clear they are not the same. - @wraps(func) - def wrapper(*args, **kwargs): - if is_mindspore_available(): - return func(*args, **kwargs) - raise ImportError(f"Method `{func.__name__}` requires MindSpore.") - - return wrapper - - -class OptionalDependencyNotAvailable(BaseException): - """Internally used error class for signalling an optional dependency was not found.""" -def direct_transformers_import(path: str, file="__init__.py") -> ModuleType: - """Imports transformers directly - - Args: - path (`str`): The path to the source file - file (`str`, optional): The file to join with the path. Defaults to "__init__.py". - - Returns: - `ModuleType`: The resulting imported module - """ - name = "mindnlp.transformers" - location = os.path.join(path, file) - spec = importlib.util.spec_from_file_location( - name, location, submodule_search_locations=[path] - ) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - module = sys.modules[name] - return module - - -def is_soundfile_availble(): - return _soundfile_available - - -def is_speech_available(): - return _torchaudio_available - - -def is_kenlm_available(): - return _kenlm_available diff --git a/mindnlp/utils/logging.py b/mindnlp/utils/logging.py deleted file mode 100644 index 005e461f7..000000000 --- a/mindnlp/utils/logging.py +++ /dev/null @@ -1,527 +0,0 @@ -# coding=utf-8 -# Copyright 2020 Optuna, Hugging Face -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -# pylint: disable=unused-import -""" Logging utilities.""" - -import functools -import logging -import os -import sys -import threading -from logging import ( - CRITICAL, # NOQA - DEBUG, # NOQA - ERROR, # NOQA - FATAL, # NOQA - INFO, # NOQA - NOTSET, # NOQA - WARN, # NOQA - WARNING, # NOQA -) -from logging import captureWarnings as _captureWarnings -from typing import Optional - -from tqdm import auto as tqdm_lib - - -_lock = threading.Lock() -_default_handler: Optional[logging.Handler] = None - -log_levels = { - "detail": logging.DEBUG, # will also print filename and line number - "debug": logging.DEBUG, - "info": logging.INFO, - "warning": logging.WARNING, - "error": logging.ERROR, - "critical": logging.CRITICAL, -} - -_default_log_level = logging.WARNING - -_tqdm_active = True - - -def _get_default_logging_level(): - """ - If TRANSFORMERS_VERBOSITY env var is set to one of the valid choices return that as the new default level. If it is - not - fall back to `_default_log_level` - """ - env_level_str = os.getenv("TRANSFORMERS_VERBOSITY", None) - if env_level_str: - if env_level_str in log_levels: - return log_levels[env_level_str] - logging.getLogger().warning( - f"Unknown option TRANSFORMERS_VERBOSITY={env_level_str}, " - f"has to be one of: { ', '.join(log_levels.keys()) }" - ) - return _default_log_level - - -def _get_library_name() -> str: - """ - Returns the name of the library based on the module name. - - Returns: - str: The name of the library extracted from the module name. - - """ - return __name__.split(".")[0] # pylint: disable=use-maxsplit-arg - - -def _get_library_root_logger() -> logging.Logger: - """ - Retrieves the root logger for the library. - - Returns: - A logging.Logger object representing the root logger for the library. - - Raises: - None. - """ - return logging.getLogger(_get_library_name()) - - -def _configure_library_root_logger() -> None: - """ - This function configures the root logger for the library. - - Returns: - None: This function does not return any value. - - Raises: - None - """ - global _default_handler - - with _lock: - if _default_handler: - # This library has already configured the library root logger. - return - _default_handler = logging.StreamHandler() # Set sys.stderr as stream. - # set defaults based on https://github.com/pyinstaller/pyinstaller/issues/7334#issuecomment-1357447176 - if sys.stderr is None: - sys.stderr = open(os.devnull, "w") - - _default_handler.flush = sys.stderr.flush - - # Apply our default configuration to the library root logger. 
- library_root_logger = _get_library_root_logger() - library_root_logger.addHandler(_default_handler) - library_root_logger.setLevel(_get_default_logging_level()) - # if logging level is debug, we add pathname and lineno to formatter for easy debugging - if os.getenv("TRANSFORMERS_VERBOSITY", None) == "detail": - formatter = logging.Formatter("[%(levelname)s|%(pathname)s:%(lineno)s] %(asctime)s >> %(message)s") - _default_handler.setFormatter(formatter) - - library_root_logger.propagate = False - - -def _reset_library_root_logger() -> None: - """ - Resets the root logger of the library to its default state. - - Args: - None - - Returns: - None. The function does not return any value. - - Raises: - None - """ - global _default_handler - - with _lock: - if not _default_handler: - return - - library_root_logger = _get_library_root_logger() - library_root_logger.removeHandler(_default_handler) - library_root_logger.setLevel(logging.NOTSET) - _default_handler = None - - -def get_log_levels_dict(): - """ - Returns a dictionary of log levels. - - Returns: - dict: A dictionary containing log levels and their corresponding values. - """ - return log_levels - - -def captureWarnings(capture): - """ - Calls the `captureWarnings` method from the logging library to enable management of the warnings emitted by the - `warnings` library. - - Read more about this method here: - https://docs.python.org/3/library/logging.html#integration-with-the-warnings-module - - All warnings will be logged through the `py.warnings` logger. - - Careful: this method also adds a handler to this logger if it does not already have one, and updates the logging - level of that logger to the library's root logger. - """ - logger = get_logger("py.warnings") - - if not logger.handlers: - logger.addHandler(_default_handler) - - logger.setLevel(_get_library_root_logger().level) - - _captureWarnings(capture) - - -def get_logger(name: Optional[str] = None) -> logging.Logger: - """ - Return a logger with the specified name. - - This function is not supposed to be directly accessed unless you are writing a custom transformers module. - """ - if name is None: - name = _get_library_name() - - _configure_library_root_logger() - return logging.getLogger(name) - - -def get_verbosity() -> int: - """ - Return the current level for the 🤗 Transformers's root logger as an int. - - Returns: - `int`: The logging level. - - - - 🤗 Transformers has following logging levels: - - - 50: `transformers.logging.CRITICAL` or `transformers.logging.FATAL` - - 40: `transformers.logging.ERROR` - - 30: `transformers.logging.WARNING` or `transformers.logging.WARN` - - 20: `transformers.logging.INFO` - - 10: `transformers.logging.DEBUG` - - """ - _configure_library_root_logger() - return _get_library_root_logger().getEffectiveLevel() - - -def set_verbosity(verbosity: int) -> None: - """ - Set the verbosity level for the 🤗 Transformers's root logger. 
- - Args: - verbosity (`int`): - Logging level, e.g., one of: - - - `transformers.logging.CRITICAL` or `transformers.logging.FATAL` - - `transformers.logging.ERROR` - - `transformers.logging.WARNING` or `transformers.logging.WARN` - - `transformers.logging.INFO` - - `transformers.logging.DEBUG` - """ - _configure_library_root_logger() - _get_library_root_logger().setLevel(verbosity) - - -def set_verbosity_info(): - """Set the verbosity to the `INFO` level.""" - return set_verbosity(INFO) - - -def set_verbosity_warning(): - """Set the verbosity to the `WARNING` level.""" - return set_verbosity(WARNING) - - -def set_verbosity_debug(): - """Set the verbosity to the `DEBUG` level.""" - return set_verbosity(DEBUG) - - -def set_verbosity_error(): - """Set the verbosity to the `ERROR` level.""" - return set_verbosity(ERROR) - - -def disable_default_handler() -> None: - """Disable the default handler of the HuggingFace Transformers's root logger.""" - _configure_library_root_logger() - - assert _default_handler is not None - _get_library_root_logger().removeHandler(_default_handler) - - -def enable_default_handler() -> None: - """Enable the default handler of the HuggingFace Transformers's root logger.""" - _configure_library_root_logger() - - assert _default_handler is not None - _get_library_root_logger().addHandler(_default_handler) - - -def add_handler(handler: logging.Handler) -> None: - """adds a handler to the HuggingFace Transformers's root logger.""" - _configure_library_root_logger() - - assert handler is not None - _get_library_root_logger().addHandler(handler) - - -def remove_handler(handler: logging.Handler) -> None: - """removes given handler from the HuggingFace Transformers's root logger.""" - _configure_library_root_logger() - - assert handler is not None and handler not in _get_library_root_logger().handlers - _get_library_root_logger().removeHandler(handler) - - -def disable_propagation() -> None: - """ - Disable propagation of the library log outputs. Note that log propagation is disabled by default. - """ - _configure_library_root_logger() - _get_library_root_logger().propagate = False - - -def enable_propagation() -> None: - """ - Enable propagation of the library log outputs. Please disable the HuggingFace Transformers's default handler to - prevent double logging if the root logger has been configured. - """ - _configure_library_root_logger() - _get_library_root_logger().propagate = True - - -def enable_explicit_format() -> None: - """ - Enable explicit formatting for every HuggingFace Transformers's logger. The explicit formatter is as follows: - ``` - [LEVELNAME|FILENAME|LINE NUMBER] TIME >> MESSAGE - ``` - All handlers currently bound to the root logger are affected by this method. - """ - handlers = _get_library_root_logger().handlers - - for handler in handlers: - formatter = logging.Formatter("[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s") - handler.setFormatter(formatter) - - -def reset_format() -> None: - """ - Resets the formatting for HuggingFace Transformers's loggers. - - All handlers currently bound to the root logger are affected by this method. 
- """ - handlers = _get_library_root_logger().handlers - - for handler in handlers: - handler.setFormatter(None) - - -def warning_advice(self, *args, **kwargs): - """ - This method is identical to `logger.warning()`, but if env var TRANSFORMERS_NO_ADVISORY_WARNINGS=1 is set, this - warning will not be printed - """ - no_advisory_warnings = os.getenv("NO_ADVISORY_WARNINGS", False) # pylint: disable=invalid-envvar-default - if no_advisory_warnings: - return - self.warning(*args, **kwargs) - - -logging.Logger.warning_advice = warning_advice - - -@functools.lru_cache(None) -def warning_once(self, *args, **kwargs): - """ - This method is identical to `logger.warning()`, but will emit the warning with the same message only once - - Note: The cache is for the function arguments, so 2 different callers using the same arguments will hit the cache. - The assumption here is that all warning messages are unique across the code. If they aren't then need to switch to - another type of cache that includes the caller frame information in the hashing function. - """ - self.warning(*args, **kwargs) - - -logging.Logger.warning_once = warning_once - - -class EmptyTqdm: - """Dummy tqdm which doesn't do anything.""" - def __init__(self, *args, **kwargs): - """ - Initializes an instance of the EmptyTqdm class. - - Args: - self: The instance of the EmptyTqdm class. - - Returns: - None. This method does not return any value. - - Raises: - None. - """ - self._iterator = args[0] if args else None - - def __iter__(self): - """ - This method implements the iterator protocol for the EmptyTqdm class. - - Args: - self: EmptyTqdm object. The instance of the EmptyTqdm class for which the iterator is being created. - - Returns: - None. This method returns an iterator object that iterates over the _iterator attribute of the EmptyTqdm instance. - - Raises: - No specific exceptions are raised by this method. - """ - return iter(self._iterator) - - def __getattr__(self, _): - """Return empty function.""" - def empty_fn(*args, **kwargs): - return - return empty_fn - - def __enter__(self): - """ - __enter__ - - Args: - self: EmptyTqdm - The self parameter refers to the current instance of the EmptyTqdm class. - - Returns: - None - This method returns None. - - Raises: - No exceptions are raised by this method. - """ - return self - - def __exit__(self, type_, value, traceback): - """ - __exit__ method in the EmptyTqdm class. - - Args: - self: EmptyTqdm object - The instance of the EmptyTqdm class. - type_: type - The type of the exception. It represents the type of the exception being handled. - value: exception - The exception that was raised. It represents the actual exception object. - traceback: traceback - The traceback object. It represents the traceback information associated with the exception. - - Returns: - None - This method does not return any value. - - Raises: - This method does not raise any exceptions explicitly. - """ - return - - -class _tqdm_cls: - - """_tqdm_cls is a Python class that provides functionality for managing the progress of tasks. It includes methods for calling the class, setting a lock, and getting a lock. This class is designed to work -in conjunction with the tqdm_lib module for displaying progress bars during iterative processes. When _tqdm_active is True, the class uses methods from the tqdm_lib.tqdm module to handle progress tracking. -Otherwise, it falls back to using an EmptyTqdm instance for progress tracking. 
The set_lock method allows users to specify a lock for thread safety, and the get_lock method retrieves the current lock if one -has been set.""" - def __call__(self, *args, **kwargs): - """ - This method __call__ in the class _tqdm_cls is used to conditionally return either a tqdm object or an EmptyTqdm object based on the _tqdm_active flag. - - Args: - self (object): The instance of the _tqdm_cls class. It is used to access the attributes and methods of the class. - - Returns: - None: This method does not explicitly return any value. It returns either a tqdm object or an EmptyTqdm object based on the _tqdm_active flag. - - Raises: - No specific exceptions are raised by this method under normal circumstances. However, if there are issues related to the instantiation of tqdm objects or EmptyTqdm objects, standard Python -exceptions may be raised. - """ - if _tqdm_active: - return tqdm_lib.tqdm(*args, **kwargs) - return EmptyTqdm(*args, **kwargs) - - def set_lock(self, *args, **kwargs): - """ - Method to set the lock for the _tqdm_cls instance. - - Args: - self (_tqdm_cls): The instance of the _tqdm_cls class. - This parameter is required to access the instance and set the lock. - It is of type _tqdm_cls and represents the instance on which the lock is being set. - - Returns: - None: This method does not return any value. The lock is set within the instance itself. - - Raises: - No specific exceptions are raised by this method. - However, if _tqdm_active is False, the method will not set the lock and will return without any further action. - """ - self._lock = None - if _tqdm_active: - return tqdm_lib.tqdm.set_lock(*args, **kwargs) - - def get_lock(self): - """ - This method is used to retrieve the lock used by the _tqdm_cls class. - - Args: - self (object): The instance of the _tqdm_cls class. - - Returns: - None: This method does not return any value. - - Raises: - N/A - """ - if _tqdm_active: - return tqdm_lib.tqdm.get_lock() - - -tqdm = _tqdm_cls() - - -def is_progress_bar_enabled() -> bool: - """Return a boolean indicating whether tqdm progress bars are enabled.""" - global _tqdm_active # pylint: disable=global-variable-not-assigned - return bool(_tqdm_active) - - -def enable_progress_bar(): - """Enable tqdm progress bar.""" - global _tqdm_active - _tqdm_active = True - - -def disable_progress_bar(): - """Disable tqdm progress bar.""" - global _tqdm_active - _tqdm_active = False diff --git a/mindnlp/utils/peft_utils.py b/mindnlp/utils/peft_utils.py deleted file mode 100644 index 6a468a945..000000000 --- a/mindnlp/utils/peft_utils.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
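The progress-bar shim removed above (`EmptyTqdm`, `_tqdm_cls`, `enable_progress_bar` / `disable_progress_bar`) boils down to a small fallback pattern. Here is a simplified standalone sketch of it; the class bodies are condensed and are not a verbatim copy of the deleted code.

```py
from tqdm.auto import tqdm as real_tqdm

_tqdm_active = True  # module-level switch, as in the deleted logging utilities

class EmptyTqdm:
    """No-op stand-in used when progress bars are disabled."""
    def __init__(self, iterable=None, *args, **kwargs):
        self._iterable = iterable
    def __iter__(self):
        return iter(self._iterable) if self._iterable is not None else iter([])
    def __getattr__(self, _):
        # Swallow update(), set_description(), close(), ...
        return lambda *args, **kwargs: None

class _tqdm_cls:
    def __call__(self, *args, **kwargs):
        return real_tqdm(*args, **kwargs) if _tqdm_active else EmptyTqdm(*args, **kwargs)

tqdm = _tqdm_cls()

def disable_progress_bar():
    global _tqdm_active
    _tqdm_active = False

# Usage: with the switch off, iteration still works but nothing is rendered.
disable_progress_bar()
for _ in tqdm(range(3), desc="no output expected"):
    pass
```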
-"""peft utils""" -import os -from typing import Dict, Optional, Union - -from .download import cached_file - - -ADAPTER_CONFIG_NAME = "adapter_config.json" -ADAPTER_WEIGHTS_NAME = "adapter_model.bin" -ADAPTER_SAFE_WEIGHTS_NAME = "adapter_model.safetensors" - - -def find_adapter_config_file( - model_id: str, - cache_dir: Optional[Union[str, os.PathLike]] = None, - force_download: bool = False, - resume_download: Optional[bool] = None, - proxies: Optional[Dict[str, str]] = None, - token: Optional[Union[bool, str]] = None, - revision: Optional[str] = None, - local_files_only: bool = False, - subfolder: str = "", - mirror: str = "huggingface", - _commit_hash: Optional[str] = None, -) -> Optional[str]: - r""" - Simply checks if the model stored on the Hub or locally is an adapter model or not, return the path of the adapter - config file if it is, None otherwise. - - Args: - model_id (`str`): - The identifier of the model to look for, can be either a local path or an id to the repository on the Hub. - cache_dir (`str` or `os.PathLike`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the standard - cache should not be used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force to (re-)download the configuration files and override the cached versions if they - exist. - resume_download: - Deprecated and ignored. All downloads are now resumed by default when possible. - Will be removed in v5 of Transformers. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `huggingface-cli login` (stored in `~/.huggingface`). - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. - - - - To test a pull request you made on the Hub, you can pass `revision="refs/pr/". - - - - local_files_only (`bool`, *optional*, defaults to `False`): - If `True`, will only try to load the tokenizer configuration from local files. - subfolder (`str`, *optional*, defaults to `""`): - In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can - specify the folder name here. 
- """ - adapter_cached_filename = None - if model_id is None: - return None - elif os.path.isdir(model_id): - list_remote_files = os.listdir(model_id) - if ADAPTER_CONFIG_NAME in list_remote_files: - adapter_cached_filename = os.path.join(model_id, ADAPTER_CONFIG_NAME) - else: - adapter_cached_filename = cached_file( - model_id, - ADAPTER_CONFIG_NAME, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - token=token, - revision=revision, - local_files_only=local_files_only, - subfolder=subfolder, - mirror=mirror, - # _commit_hash=_commit_hash, - _raise_exceptions_for_gated_repo=False, - _raise_exceptions_for_missing_entries=False, - _raise_exceptions_for_connection_errors=False, - ) - - return adapter_cached_filename diff --git a/mindnlp/utils/save.py b/mindnlp/utils/save.py deleted file mode 100644 index 23ecda9b5..000000000 --- a/mindnlp/utils/save.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -Save functions -""" -from typing import Union - -def convert_file_size_to_int(size: Union[int, str]): - """ - Converts a size expressed as a string with digits an unit (like `"5MB"`) to an integer (in bytes). - - Args: - size (`int` or `str`): The size to convert. Will be directly returned if an `int`. - - Example: - ```py - >>> convert_file_size_to_int("1MiB") - 1048576 - ``` - """ - if isinstance(size, int): - return size - if size.upper().endswith("GIB"): - return int(size[:-3]) * (2**30) - if size.upper().endswith("MIB"): - return int(size[:-3]) * (2**20) - if size.upper().endswith("KIB"): - return int(size[:-3]) * (2**10) - if size.upper().endswith("GB"): - int_size = int(size[:-2]) * (10**9) - return int_size // 8 if size.endswith("b") else int_size - if size.upper().endswith("MB"): - int_size = int(size[:-2]) * (10**6) - return int_size // 8 if size.endswith("b") else int_size - if size.upper().endswith("KB"): - int_size = int(size[:-2]) * (10**3) - return int_size // 8 if size.endswith("b") else int_size - raise ValueError("`size` is not in a valid format. Use an integer followed by the unit, e.g., '5GB'.") diff --git a/mindnlp/utils/serialization.py b/mindnlp/utils/serialization.py deleted file mode 100644 index 9afecd7f2..000000000 --- a/mindnlp/utils/serialization.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Contains helpers to split tensors into shards.""" - -from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Optional, TypeVar, Union - -from . import logging - - -TensorT = TypeVar("TensorT") -TensorSizeFn_T = Callable[[TensorT], int] -StorageIDFn_T = Callable[[TensorT], Optional[Any]] - -MAX_SHARD_SIZE = 5_000_000_000 # 5GB -FILENAME_PATTERN = "model{suffix}.safetensors" - -logger = logging.get_logger(__file__) - - -@dataclass -class StateDictSplit: - is_sharded: bool = field(init=False) - metadata: Dict[str, Any] - filename_to_tensors: Dict[str, List[str]] - tensor_to_filename: Dict[str, str] - - def __post_init__(self): - self.is_sharded = len(self.filename_to_tensors) > 1 - - -def split_state_dict_into_shards_factory( - state_dict: Dict[str, TensorT], - *, - get_tensor_size: TensorSizeFn_T, - get_storage_id: StorageIDFn_T = lambda tensor: None, - filename_pattern: str = FILENAME_PATTERN, - max_shard_size: Union[int, str] = MAX_SHARD_SIZE, -) -> StateDictSplit: - """ - Split a model state dictionary in shards so that each shard is smaller than a given size. - - The shards are determined by iterating through the `state_dict` in the order of its keys. There is no optimization - made to make each shard as close as possible to the maximum size passed. For example, if the limit is 10GB and we - have tensors of sizes [6GB, 6GB, 2GB, 6GB, 2GB, 2GB] they will get sharded as [6GB], [6+2GB], [6+2+2GB] and not - [6+2+2GB], [6+2GB], [6GB]. - - - - If one of the model's tensor is bigger than `max_shard_size`, it will end up in its own shard which will have a - size greater than `max_shard_size`. - - - - Args: - state_dict (`Dict[str, Tensor]`): - The state dictionary to save. - get_tensor_size (`Callable[[Tensor], int]`): - A function that returns the size of a tensor in bytes. - get_storage_id (`Callable[[Tensor], Optional[Any]]`, *optional*): - A function that returns a unique identifier to a tensor storage. Multiple different tensors can share the - same underlying storage. This identifier is guaranteed to be unique and constant for this tensor's storage - during its lifetime. Two tensor storages with non-overlapping lifetimes may have the same id. - filename_pattern (`str`, *optional*): - The pattern to generate the files names in which the model will be saved. Pattern must be a string that - can be formatted with `filename_pattern.format(suffix=...)` and must contain the keyword `suffix` - Defaults to `"model{suffix}.safetensors"`. - max_shard_size (`int` or `str`, *optional*): - The maximum size of each shard, in bytes. Defaults to 5GB. - - Returns: - [`StateDictSplit`]: A `StateDictSplit` object containing the shards and the index to retrieve them. 
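To make the greedy sharding strategy described in the docstring above concrete, here is a small sketch that drives the factory with NumPy arrays standing in for tensors. The import path is an assumption (an older `mindnlp.utils.serialization`); the sizes are chosen only to force multiple shards.

```python
# Sketch only: exercises split_state_dict_into_shards_factory from the hunk above.
# The import path assumes an older mindnlp release that still shipped this module.
import numpy as np
from mindnlp.utils.serialization import split_state_dict_into_shards_factory

# Six 4 MB tensors (~25 MB total).
state_dict = {f"layer.{i}.weight": np.zeros((1024, 1024), dtype=np.float32) for i in range(6)}

split = split_state_dict_into_shards_factory(
    state_dict,
    get_tensor_size=lambda t: t.nbytes,   # bytes per stand-in tensor
    max_shard_size="10MB",                # parsed by parse_size_to_int below
)

print(split.is_sharded)                   # True: tensors are packed greedily, two per ~8 MB shard
for filename, keys in split.filename_to_tensors.items():
    print(filename, keys)                 # e.g. model-00001-of-00003.safetensors [...]
```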
- """ - storage_id_to_tensors: Dict[Any, List[str]] = {} - - shard_list: List[Dict[str, TensorT]] = [] - current_shard: Dict[str, TensorT] = {} - current_shard_size = 0 - total_size = 0 - - if isinstance(max_shard_size, str): - max_shard_size = parse_size_to_int(max_shard_size) - - for key, tensor in state_dict.items(): - # when bnb serialization is used the weights in the state dict can be strings - # check: https://github.com/huggingface/transformers/pull/24416 for more details - if isinstance(tensor, str): - logger.info("Skipping tensor %s as it is a string (bnb serialization)", key) - continue - - # If a `tensor` shares the same underlying storage as another tensor, we put `tensor` in the same `block` - storage_id = get_storage_id(tensor) - if storage_id is not None: - if storage_id in storage_id_to_tensors: - # We skip this tensor for now and will reassign to correct shard later - storage_id_to_tensors[storage_id].append(key) - continue - # This is the first tensor with this storage_id, we create a new entry - # in the storage_id_to_tensors dict => we will assign the shard id later - storage_id_to_tensors[storage_id] = [key] - - # Compute tensor size - tensor_size = get_tensor_size(tensor) - - # If this tensor is bigger than the maximal size, we put it in its own shard - if tensor_size > max_shard_size: - total_size += tensor_size - shard_list.append({key: tensor}) - continue - - # If this tensor is going to tip up over the maximal size, we split. - # Current shard already has some tensors, we add it to the list of shards and create a new one. - if current_shard_size + tensor_size > max_shard_size: - shard_list.append(current_shard) - current_shard = {} - current_shard_size = 0 - - # Add the tensor to the current shard - current_shard[key] = tensor - current_shard_size += tensor_size - total_size += tensor_size - - # Add the last shard - if len(current_shard) > 0: - shard_list.append(current_shard) - nb_shards = len(shard_list) - - # Loop over the tensors that share the same storage and assign them together - for storage_id, keys in storage_id_to_tensors.items(): - # Let's try to find the shard where the first tensor of this storage is and put all tensors in the same shard - for shard in shard_list: - if keys[0] in shard: - for key in keys: - shard[key] = state_dict[key] - break - - # If we only have one shard, we return it => no need to build the index - if nb_shards == 1: - filename = filename_pattern.format(suffix="") - return StateDictSplit( - metadata={"total_size": total_size}, - filename_to_tensors={filename: list(state_dict.keys())}, - tensor_to_filename={key: filename for key in state_dict.keys()}, - ) - - # Now that each tensor is assigned to a shard, let's assign a filename to each shard - tensor_name_to_filename = {} - filename_to_tensors = {} - for idx, shard in enumerate(shard_list): - filename = filename_pattern.format(suffix=f"-{idx+1:05d}-of-{nb_shards:05d}") - for key in shard: - tensor_name_to_filename[key] = filename - filename_to_tensors[filename] = list(shard.keys()) - - # Build the index and return - return StateDictSplit( - metadata={"total_size": total_size}, - filename_to_tensors=filename_to_tensors, - tensor_to_filename=tensor_name_to_filename, - ) - - -SIZE_UNITS = { - "TB": 10**12, - "GB": 10**9, - "MB": 10**6, - "KB": 10**3, -} - - -def parse_size_to_int(size_as_str: str) -> int: - """ - Parse a size expressed as a string with digits and unit (like `"5MB"`) to an integer (in bytes). - - Supported units are "TB", "GB", "MB", "KB". 
- - Args: - size_as_str (`str`): The size to convert. Will be directly returned if an `int`. - - Example: - - ```py - >>> parse_size_to_int("5MB") - 5000000 - ``` - """ - size_as_str = size_as_str.strip() - - # Parse unit - unit = size_as_str[-2:].upper() - if unit not in SIZE_UNITS: - raise ValueError(f"Unit '{unit}' not supported. Supported units are TB, GB, MB, KB. Got '{size_as_str}'.") - multiplier = SIZE_UNITS[unit] - - # Parse value - try: - value = float(size_as_str[:-2].strip()) - except ValueError as e: - raise ValueError(f"Could not parse the size value from '{size_as_str}': {e}") from e - - return int(value * multiplier) - -def split_state_dict_into_shards( - state_dict: Dict[str, "mindspore.Tensor"], - *, - filename_pattern: str = FILENAME_PATTERN, - max_shard_size: Union[int, str] = MAX_SHARD_SIZE, -) -> StateDictSplit: - """ - Split a model state dictionary in shards so that each shard is smaller than a given size. - - The shards are determined by iterating through the `state_dict` in the order of its keys. There is no optimization - made to make each shard as close as possible to the maximum size passed. For example, if the limit is 10GB and we - have tensors of sizes [6GB, 6GB, 2GB, 6GB, 2GB, 2GB] they will get sharded as [6GB], [6+2GB], [6+2+2GB] and not - [6+2+2GB], [6+2GB], [6GB]. - - - - If one of the model's tensor is bigger than `max_shard_size`, it will end up in its own shard which will have a - size greater than `max_shard_size`. - - - - Args: - state_dict (`Dict[str, torch.Tensor]`): - The state dictionary to save. - filename_pattern (`str`, *optional*): - The pattern to generate the files names in which the model will be saved. Pattern must be a string that - can be formatted with `filename_pattern.format(suffix=...)` and must contain the keyword `suffix` - Defaults to `"model{suffix}.safetensors"`. - max_shard_size (`int` or `str`, *optional*): - The maximum size of each shard, in bytes. Defaults to 5GB. - - Returns: - [`StateDictSplit`]: A `StateDictSplit` object containing the shards and the index to retrieve them. - - Example: - ```py - >>> import json - >>> import os - >>> from safetensors.torch import save_file as safe_save_file - >>> from huggingface_hub import split_torch_state_dict_into_shards - - >>> def save_state_dict(state_dict: Dict[str, torch.Tensor], save_directory: str): - ... state_dict_split = split_torch_state_dict_into_shards(state_dict) - ... for filename, tensors in state_dict_split.filename_to_tensors.items(): - ... shard = {tensor: state_dict[tensor] for tensor in tensors} - ... safe_save_file( - ... shard, - ... os.path.join(save_directory, filename), - ... metadata={"format": "pt"}, - ... ) - ... if state_dict_split.is_sharded: - ... index = { - ... "metadata": state_dict_split.metadata, - ... "weight_map": state_dict_split.tensor_to_filename, - ... } - ... with open(os.path.join(save_directory, "model.safetensors.index.json"), "w") as f: - ... f.write(json.dumps(index, indent=2)) - ``` - """ - return split_state_dict_into_shards_factory( - state_dict, - max_shard_size=max_shard_size, - filename_pattern=filename_pattern, - get_tensor_size=get_tensor_size, - get_storage_id=get_storage_id, - ) - - -def get_storage_id(tensor: "mindspore.Tensor"): - """ - Return unique identifier to a tensor storage. - - Multiple different tensors can share the same underlying storage. For - example, "meta" tensors all share the same storage, and thus their identifier will all be equal. 
This identifier is - guaranteed to be unique and constant for this tensor's storage during its lifetime. Two tensor storages with - non-overlapping lifetimes may have the same id. - - Taken from https://github.com/huggingface/transformers/blob/1ecf5f7c982d761b4daaa96719d162c324187c64/src/transformers/pytorch_utils.py#L278. - """ - return id(tensor) - -def get_tensor_size(tensor: "mindspore.Tensor") -> int: - return tensor.numel() * tensor.itemsize diff --git a/mindnlp/utils/testing_utils.py b/mindnlp/utils/testing_utils.py deleted file mode 100644 index 13d4b4f1d..000000000 --- a/mindnlp/utils/testing_utils.py +++ /dev/null @@ -1,2103 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Utils for test cases.""" -import collections -import contextlib -import doctest -import functools -import inspect -import logging -import multiprocessing -import os -import re -import shlex -import shutil -import subprocess -import sys -import tempfile -import time -import unittest -import asyncio -from collections.abc import Mapping -from collections import defaultdict - -from io import StringIO -from pathlib import Path -from typing import Callable, Dict, Iterable, Iterator, List, Optional, Union -from unittest import mock -from unittest.mock import patch - -import urllib3 -import numpy as np - -import mindspore -from mindnlp.utils import logging as mindnlp_logging -from mindnlp.configs import SUPPORT_BF16 - -from .import_utils import ( - is_pytest_available, - is_mindspore_available, - is_essentia_available, - is_librosa_available, - is_pretty_midi_available, - is_scipy_available, - is_pyctcdecode_available, - is_safetensors_available, - is_sentencepiece_available, - is_soundfile_availble, - is_tokenizers_available, - is_pytesseract_available, - is_vision_available, - is_g2p_en_available, - is_levenshtein_available, - is_nltk_available, - is_ftfy_available -) -from .generic import strtobool - -if is_pytest_available(): - from _pytest.doctest import ( - Module, - _get_checker, - _get_continue_on_failure, - _get_runner, - _is_mocked, - _patch_unwrap_mock_aware, - get_optionflags, - import_path, - ) - from _pytest.config import create_terminal_writer - from _pytest.outcomes import skip - from pytest import DoctestItem -else: - Module = object - DoctestItem = object - -if is_mindspore_available(): - from mindspore import ops - - -DUMMY_UNKNOWN_IDENTIFIER = "julien-c/dummy-unknown" -SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy" - -def is_pipeline_test(test_case): - """ - Decorator marking a test as a pipeline test. If RUN_PIPELINE_TESTS is set to a falsy value, those tests will be - skipped. 
- """ - if not _run_pipeline_tests: - return unittest.skip("test is pipeline test")(test_case) - else: - try: - import pytest # We don't need a hard dependency on pytest in the main library - except ImportError: - return test_case - else: - return pytest.mark.is_pipeline_test()(test_case) - -def parse_flag_from_env(key, default=False): - """ - Parses a flag value from the environment variable. - - Args: - key (str): The name of the environment variable to retrieve the flag value from. - default (bool, optional): The default flag value to return if the environment variable is not set. Defaults to False. - - Returns: - bool: The parsed flag value. Returns the default value if the environment variable is not set or if its value cannot be parsed. - - Raises: - ValueError: If the environment variable value is set but cannot be parsed as a boolean ('yes' or 'no'). - - Note: - The flag value is retrieved from the environment variable specified by `key`. If the environment variable is not set, the default value is returned. If the environment variable value is set, it is -parsed as a boolean using the `strtobool` function from the `distutils.util` module. If the parsing fails, a `ValueError` is raised with a descriptive error message indicating that the value must be either -'yes' or 'no'. - """ - try: - value = os.environ[key] - except KeyError: - # KEY isn't set, default to `default`. - _value = default - else: - # KEY is set, convert it to True or False. - try: - _value = strtobool(value) - except ValueError as exc: - # More values are supported, but let's keep the message simple. - raise ValueError(f"If set, {key} must be yes or no.") from exc - return _value - -_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False) -_run_too_slow_tests = parse_flag_from_env("RUN_TOO_SLOW", default=False) -_run_pipeline_tests = parse_flag_from_env("RUN_PIPELINE_TESTS", default=True) - -def slow(test_case): - """ - Decorator marking a test as slow. - - Slow tests are skipped by default. Set the RUN_SLOW environment variable to a truthy value to run them. - - """ - return unittest.skipUnless(_run_slow_tests, "test is slow")(test_case) - -def tooslow(test_case): - """ - Decorator marking a test as too slow. - - Slow tests are skipped while they're in the process of being fixed. No test should stay tagged as "tooslow" as - these will not be tested by the CI. - - """ - return unittest.skipUnless(_run_too_slow_tests, "test is too slow")(test_case) - -def parse_int_from_env(key, default=None): - """Parses an integer value from the specified environment variable. - - Args: - key (str): The name of the environment variable to retrieve the integer value from. - default (int, optional): The default integer value to return if the environment variable is not set or cannot be converted to an integer. Defaults to None. - - Returns: - int or None: The integer value parsed from the environment variable or the default value if provided. Returns None if the environment variable is not set and no default value is specified. - - Raises: - ValueError: If the value retrieved from the environment variable cannot be converted to an integer. - """ - try: - value = os.environ[key] - except KeyError: - _value = default - else: - try: - _value = int(value) - except ValueError as exc: - raise ValueError(f"If set, {key} must be a int.") from exc - return _value - - -def require_ftfy(test_case): - """ - Decorator marking a test that requires ftfy. These tests are skipped when ftfy isn't installed. 
- """ - return unittest.skipUnless(is_ftfy_available(), "test requires ftfy")(test_case) - - -def require_levenshtein(test_case): - """ - Decorator marking a test that requires Levenshtein. - - These tests are skipped when Levenshtein isn't installed. - - """ - return unittest.skipUnless(is_levenshtein_available(), "test requires Levenshtein")(test_case) - - -def require_nltk(test_case): - """ - Decorator marking a test that requires NLTK. - - These tests are skipped when NLTK isn't installed. - - """ - return unittest.skipUnless(is_nltk_available(), "test requires NLTK")(test_case) - - -def require_vision(test_case): - """ - Decorator marking a test that requires the vision dependencies. These tests are skipped when torchaudio isn't - installed. - """ - return unittest.skipUnless(is_vision_available(), "test requires vision")(test_case) - -def require_tokenizers(test_case): - """ - Decorator marking a test that requires 🤗 Tokenizers. These tests are skipped when 🤗 Tokenizers isn't installed. - """ - return unittest.skipUnless(is_tokenizers_available(), "test requires tokenizers")(test_case) - -def require_sentencepiece(test_case): - """ - Decorator marking a test that requires SentencePiece. These tests are skipped when SentencePiece isn't installed. - """ - return unittest.skipUnless(is_sentencepiece_available(), "test requires SentencePiece")(test_case) - -def require_mindspore(test_case): - """ - Decorator marking a test that requires MindSpore. - - These tests are skipped when MindSpore isn't installed. - - """ - return unittest.skipUnless(is_mindspore_available(), "test requires MindSpore")(test_case) - -def require_bfloat16(test_case): - """require_bfloat16""" - return unittest.skipUnless(SUPPORT_BF16, "test need bfloat16")(test_case) - -def require_mindspore_gpu(test_case): - """Decorator marking a test that requires CUDA and MindSpore.""" - return unittest.skipUnless(mindspore.get_context('device_target') == "GPU", "test requires CUDA")(test_case) - -def require_mindspore_npu(test_case): - """Decorator marking a test that requires CANN and MindSpore.""" - return unittest.skipUnless(mindspore.get_context('device_target') == "Ascend", "test requires CANN")(test_case) - - -def require_librosa(test_case): - """ - Decorator marking a test that requires librosa - """ - return unittest.skipUnless(is_librosa_available(), "test requires librosa")(test_case) - -def require_essentia(test_case): - """ - Decorator marking a test that requires essentia - """ - return unittest.skipUnless(is_essentia_available(), "test requires essentia")(test_case) - -def require_pretty_midi(test_case): - """ - Decorator marking a test that requires pretty_midi - """ - return unittest.skipUnless(is_pretty_midi_available(), "test requires pretty_midi")(test_case) - -def require_scipy(test_case): - """ - Decorator marking a test that requires Scipy. These tests are skipped when SentencePiece isn't installed. - """ - return unittest.skipUnless(is_scipy_available(), "test requires Scipy")(test_case) - -def require_pyctcdecode(test_case): - """ - Decorator marking a test that requires pyctcdecode - """ - return unittest.skipUnless(is_pyctcdecode_available(), "test requires pyctcdecode")(test_case) - -def require_safetensors(test_case): - """ - Decorator marking a test that requires safetensors. These tests are skipped when safetensors isn't installed. 
- """ - return unittest.skipUnless(is_safetensors_available(), "test requires safetensors")(test_case) - -def require_pytesseract(test_case): - """ - Decorator marking a test that requires pytesseract - """ - return unittest.skipUnless(is_pytesseract_available(), "test requires pytesseract")(test_case) - -def require_g2p_en(test_case): - """ - Decorator marking a test that requires pytesseract - """ - return unittest.skipUnless(is_g2p_en_available(), "test requires g2p-en")(test_case) - - -def cmd_exists(cmd): - """ - Check if a command exists in the system PATH. - - Args: - cmd (str): The name of the command to check for existence in the system PATH. - - Returns: - None: Returns None if the command exists in the system PATH, otherwise returns False. - - Raises: - None. - """ - return shutil.which(cmd) is not None -# -# Helper functions for dealing with testing text outputs -# The original code came from: -# https://github.com/fastai/fastai/blob/master/tests/utils/text.py - - -# When any function contains print() calls that get overwritten, like progress bars, -# a special care needs to be applied, since under pytest -s captured output (capsys -# or contextlib.redirect_stdout) contains any temporary printed strings, followed by -# \r's. This helper function ensures that the buffer will contain the same output -# with and without -s in pytest, by turning: -# foo bar\r tar mar\r final message -# into: -# final message -# it can handle a single string or a multiline buffer -def apply_print_resets(buf): - """ - Apply print resets by removing any characters before the last carriage return in the given buffer. - - Args: - buf (str): The input buffer containing text data. - - Returns: - None. The function modifies the buffer in place. - - Raises: - None. - """ - return re.sub(r"^.*\r", "", buf, 0, re.M) - - -def assert_screenout(out, what): - """ - This function asserts the presence of a specified string within the provided output. - - Args: - out (str): The output string to be checked for the presence of the specified string. - what (str): The string to be searched for within the output. - - Returns: - None: This function does not return any value. - - Raises: - AssertionError: If the specified string 'what' is not found within the output string 'out'. - """ - out_pr = apply_print_resets(out).lower() - match_str = out_pr.find(what.lower()) - assert match_str != -1, f"expecting to find {what} in output: f{out_pr}" - - -class CaptureStd: - """ - Context manager to capture: - - - stdout: replay it, clean it up and make it available via `obj.out` - - stderr: replay it and make it available via `obj.err` - - Args: - out (`bool`, *optional*, defaults to `True`): Whether to capture stdout or not. - err (`bool`, *optional*, defaults to `True`): Whether to capture stderr or not. - replay (`bool`, *optional*, defaults to `True`): Whether to replay or not. - By default each captured stream gets replayed back on context's exit, so that one can see what the test was - doing. If this is a not wanted behavior and the captured data shouldn't be replayed, pass `replay=False` to - disable this feature. 
- - Examples: - - ```python - # to capture stdout only with auto-replay - with CaptureStdout() as cs: - print("Secret message") - assert "message" in cs.out - - # to capture stderr only with auto-replay - import sys - - with CaptureStderr() as cs: - print("Warning: ", file=sys.stderr) - assert "Warning" in cs.err - - # to capture both streams with auto-replay - with CaptureStd() as cs: - print("Secret message") - print("Warning: ", file=sys.stderr) - assert "message" in cs.out - assert "Warning" in cs.err - - # to capture just one of the streams, and not the other, with auto-replay - with CaptureStd(err=False) as cs: - print("Secret message") - assert "message" in cs.out - # but best use the stream-specific subclasses - - # to capture without auto-replay - with CaptureStd(replay=False) as cs: - print("Secret message") - assert "message" in cs.out - ```""" - def __init__(self, out=True, err=True, replay=True): - """Initialize a CaptureStd object. - - Args: - self (CaptureStd): The instance of the CaptureStd class. - out (bool): Flag indicating whether to capture stdout. Default is True. - err (bool): Flag indicating whether to capture stderr. Default is True. - replay (bool): Flag indicating whether to replay captured output. Default is True. - - Returns: - None - - Raises: - None - - This method initializes a CaptureStd object with the given parameters. The 'out' parameter determines whether to capture stdout, while the 'err' parameter determines whether to capture stderr. By -default, both 'out' and 'err' are set to True. If 'out' is True, a StringIO object is created to capture stdout. If 'out' is False, stdout is not captured and the 'out' attribute is set to 'not capturing -stdout'. The same logic applies to 'err' and stderr. - - The 'replay' parameter determines whether the captured output should be replayed. By default, 'replay' is set to True. - - Note: If 'out' or 'err' is set to True, but the CaptureStd context is not finished yet (i.e., __exit__ is not called), an error message is set to the corresponding attribute indicating that the context -was called too early. - """ - self.replay = replay - - if out: - self.out_buf = StringIO() - self.out = "error: CaptureStd context is unfinished yet, called too early" - else: - self.out_buf = None - self.out = "not capturing stdout" - - if err: - self.err_buf = StringIO() - self.err = "error: CaptureStd context is unfinished yet, called too early" - else: - self.err_buf = None - self.err = "not capturing stderr" - - def __enter__(self): - """ - The '__enter__' method is used as a context manager to redirect the standard output and standard error streams to the provided buffers. - - Args: - self: An instance of the 'CaptureStd' class. - - Returns: - None. This method does not return any value explicitly. - - Raises: - None. - """ - if self.out_buf: - self.out_old = sys.stdout - sys.stdout = self.out_buf - - if self.err_buf: - self.err_old = sys.stderr - sys.stderr = self.err_buf - - return self - - def __exit__(self, *exc): - """ - This method __exit__ is called automatically when exiting a 'with' block that uses the CaptureStd context manager. - - Args: - self: An instance of the CaptureStd class that represents the current context manager. It is used to access the attributes and buffers within the context manager. - - Returns: - None. The method does not explicitly return a value. - - Raises: - This method does not raise any exceptions explicitly. 
However, exceptions may be raised if there are errors during the execution of the code within the method. - """ - if self.out_buf: - sys.stdout = self.out_old - captured = self.out_buf.getvalue() - if self.replay: - sys.stdout.write(captured) - self.out = apply_print_resets(captured) - - if self.err_buf: - sys.stderr = self.err_old - captured = self.err_buf.getvalue() - if self.replay: - sys.stderr.write(captured) - self.err = captured - - def __repr__(self): - """ - Returns a string representation of the CaptureStd object. - - Args: - self: The instance of the CaptureStd class. - - Returns: - None. This method does not return any value. - - Raises: - None. - - Description: - The __repr__ method is called when the repr() function is used on an instance of the CaptureStd class. It generates a string representation of the object, which includes the captured stdout and -stderr outputs, if any. The generated string representation is returned by the method. - - This method checks if the 'out_buf' attribute of the CaptureStd object is not empty. If it is not empty, the captured stdout output is added to the message string. Similarly, if the 'err_buf' -attribute is not empty, the captured stderr output is added to the message string. The final message string is then returned by the method. - - Note that the stdout and stderr outputs are represented as 'stdout: ' and 'stderr: ' respectively in the message string. - - Example Usage: - capture = CaptureStd() - capture.capture_stdout('Hello, world!') - capture.capture_stderr('Oops, an error occurred.') - repr_str = repr(capture) - print(repr_str) - # Output: "stdout: Hello, world!\nstderr: Oops, an error occurred.\n" - """ - msg = "" - if self.out_buf: - msg += f"stdout: {self.out}\n" - if self.err_buf: - msg += f"stderr: {self.err}\n" - return msg - - -# in tests it's the best to capture only the stream that's wanted, otherwise -# it's easy to miss things, so unless you need to capture both streams, use the -# subclasses below (less typing). Or alternatively, configure `CaptureStd` to -# disable the stream you don't need to test. - - -class CaptureStdout(CaptureStd): - """Same as CaptureStd but captures only stdout""" - def __init__(self, replay=True): - """ - Initializes an instance of the CaptureStdout class. - - Args: - self: The instance of the class. - replay (bool): A boolean flag indicating whether the captured output should be replayed. - Defaults to True. If set to True, the captured output will be replayed. - If set to False, the captured output will not be replayed. - - Returns: - None. This method does not return any value. - - Raises: - No specific exceptions are raised by this method. - """ - super().__init__(err=False, replay=replay) - - -class CaptureStderr(CaptureStd): - """Same as CaptureStd but captures only stderr""" - def __init__(self, replay=True): - """ - Initializes an instance of the CaptureStderr class. - - Args: - self (CaptureStderr): The current object. - replay (bool): Indicates whether to replay the captured stderr output. Default is True. - - Returns: - None. This method does not return any value. - - Raises: - None. This method does not raise any exceptions. 
- """ - super().__init__(out=False, replay=replay) - - -class CaptureLogger: - """ - Context manager to capture `logging` streams - - Args: - logger: 'logging` logger object - - Returns: - The captured output is available via `self.out` - - Example: - - ```python - >>> from transformers import logging - >>> from transformers.testing_utils import CaptureLogger - - >>> msg = "Testing 1, 2, 3" - >>> logging.set_verbosity_info() - >>> logger = logging.get_logger("transformers.models.bart.tokenization_bart") - >>> with CaptureLogger(logger) as cl: - ... logger.info(msg) - >>> assert cl.out, msg + "\n" - ``` - """ - def __init__(self, logger): - """ - Initializes a new instance of the CaptureLogger class. - - Args: - self: The instance of the class. - logger: An object representing the logger to be used for capturing logs. It should be an instance of a logger class. - - Returns: - None. This method does not return any value. - - Raises: - None. This method does not raise any exceptions. - """ - self.logger = logger - self.io = StringIO() - self.sh = logging.StreamHandler(self.io) - self.out = "" - - def __enter__(self): - """ - This method is an implementation of the context manager protocol for the CaptureLogger class. - - Args: - self: An instance of the CaptureLogger class. It represents the current object that the method is being called upon. - - Returns: - None. The method does not explicitly return any value, but it adds a handler to the logger associated with the CaptureLogger instance. - - Raises: - This method does not raise any exceptions under normal circumstances. However, potential exceptions could be raised if there are issues with adding the handler to the logger, such as improper -configuration of the logging system. - """ - self.logger.addHandler(self.sh) - return self - - def __exit__(self, *exc): - """ - This method __exit__ is called automatically when exiting a 'with' block in the CaptureLogger class. - - Args: - self (CaptureLogger): An instance of the CaptureLogger class. It is used to access the logger and the captured output. - - Returns: - None. This method does not return any value. - - Raises: - This method does not raise any exceptions explicitly. However, exceptions may be raised internally if there are issues with removing the handler or getting the captured output. - """ - self.logger.removeHandler(self.sh) - self.out = self.io.getvalue() - - def __repr__(self): - """ - Return a string representation of the CaptureLogger object. - - Args: - self (CaptureLogger): The instance of the CaptureLogger class. - - Returns: - None: This method does not explicitly return any value, as it returns None. - - Raises: - None: This method does not raise any exceptions. - """ - return f"captured: {self.out}\n" - - -@contextlib.contextmanager -def LoggingLevel(level): - """ - This is a context manager to temporarily change transformers modules logging level to the desired value and have it - restored to the original setting at the end of the scope. - - Example: - - ```python - with LoggingLevel(logging.INFO): - AutoModel.from_pretrained("gpt2") # calls logger.info() several times - ``` - """ - orig_level = mindnlp_logging.get_verbosity() - try: - mindnlp_logging.set_verbosity(level) - yield - finally: - mindnlp_logging.set_verbosity(orig_level) - - -@contextlib.contextmanager -# adapted from https://stackoverflow.com/a/64789046/9201239 -def ExtendSysPath(path: Union[str, os.PathLike]) -> Iterator[None]: - """ - Temporary add given path to `sys.path`. 
- - Usage : - - ```python - with ExtendSysPath("/path/to/dir"): - mymodule = importlib.import_module("mymodule") - ``` - """ - path = os.fspath(path) - try: - sys.path.insert(0, path) - yield - finally: - sys.path.remove(path) - - -class TestCasePlus(unittest.TestCase): - """ - This class extends *unittest.TestCase* with additional features. - - Feature 1: A set of fully resolved important file and dir path accessors. - - In tests often we need to know where things are relative to the current test file, and it's not trivial since the - test could be invoked from more than one directory or could reside in sub-directories with different depths. This - class solves this problem by sorting out all the basic paths and provides easy accessors to them: - - - `pathlib` objects (all fully resolved): - - - `test_file_path` - the current test file path (=`__file__`) - - `test_file_dir` - the directory containing the current test file - - `tests_dir` - the directory of the `tests` test suite - - `examples_dir` - the directory of the `examples` test suite - - `repo_root_dir` - the directory of the repository - - `src_dir` - the directory of `src` (i.e. where the `transformers` sub-dir resides) - - - stringified paths---same as above but these return paths as strings, rather than `pathlib` objects: - - - `test_file_path_str` - - `test_file_dir_str` - - `tests_dir_str` - - `examples_dir_str` - - `repo_root_dir_str` - - `src_dir_str` - - Feature 2: Flexible auto-removable temporary dirs which are guaranteed to get removed at the end of test. - - 1. Create a unique temporary dir: - - ```python - def test_whatever(self): - tmp_dir = self.get_auto_remove_tmp_dir() - ``` - - `tmp_dir` will contain the path to the created temporary dir. It will be automatically removed at the end of the - test. - - - 2. Create a temporary dir of my choice, ensure it's empty before the test starts and don't - empty it after the test. - - ```python - def test_whatever(self): - tmp_dir = self.get_auto_remove_tmp_dir("./xxx") - ``` - - This is useful for debug when you want to monitor a specific directory and want to make sure the previous tests - didn't leave any data in there. - - 3. You can override the first two options by directly overriding the `before` and `after` args, leading to the - following behavior: - - `before=True`: the temporary dir will always be cleared at the beginning of the test. - - `before=False`: if the temporary dir already existed, any existing files will remain there. - - `after=True`: the temporary dir will always be deleted at the end of the test. - - `after=False`: the temporary dir will always be left intact at the end of the test. - - Note 1: In order to run the equivalent of `rm -r` safely, only subdirs of the project repository checkout are - allowed if an explicit `tmp_dir` is used, so that by mistake no `/tmp` or similar important part of the filesystem - will get nuked. i.e. please always pass paths that start with `./` - - Note 2: Each test can register multiple temporary dirs and they all will get auto-removed, unless requested - otherwise. - - Feature 3: Get a copy of the `os.environ` object that sets up `PYTHONPATH` specific to the current test suite. This - is useful for invoking external programs from the test suite - e.g. distributed training. - - - ```python - def test_whatever(self): - env = self.get_env() - ```""" - def setUp(self): - """ - Set up the necessary environment for the TestCasePlus class. - - Args: - self: The instance of the TestCasePlus class. - - Returns: - None. 
This method does not return any value. - - Raises: - ValueError: If the root directory of the repository cannot be determined from the test file path. - - Description: - This method is called before each test case to set up the required environment for the TestCasePlus class. It initializes various directories and paths based on the current test file's location. The -method performs the following steps: - - 1. Sets up a list to keep track of temporary directories that need to be cleaned up later. - 2. Retrieves the path of the test file using the inspect module. - 3. Resolves the absolute path of the test file. - 4. Determines the parent directory of the test file. - 5. Checks if there are 'src' and 'tests' directories in any of the parent directories up to three levels above the test file. - 6. If such directories are found, the loop breaks and the repository root directory is set as the temporary directory. - 7. If no valid temporary directory is found, a ValueError is raised indicating that the root directory of the repository could not be determined. - 8. Sets the paths for the 'tests', 'examples', and 'src' directories within the repository root directory. - - Note: - This method assumes a specific directory structure for the repository, where 'src' and 'tests' directories exist at an appropriate level above the test file. - - Example usage: - test_case = TestCasePlus() - test_case.setUp() - """ - # get_auto_remove_tmp_dir feature: - self.teardown_tmp_dirs = [] - - # figure out the resolved paths for repo_root, tests, examples, etc. - self._test_file_path = inspect.getfile(self.__class__) - path = Path(self._test_file_path).resolve() - self._test_file_dir = path.parents[0] - for up in [1, 2, 3]: - tmp_dir = path.parents[up] - if (tmp_dir / "src").is_dir() and (tmp_dir / "tests").is_dir(): - break - if tmp_dir: - self._repo_root_dir = tmp_dir - else: - raise ValueError(f"can't figure out the root of the repo from {self._test_file_path}") - self._tests_dir = self._repo_root_dir / "tests" - self._examples_dir = self._repo_root_dir / "examples" - self._src_dir = self._repo_root_dir / "src" - - @property - def test_file_path(self): - """ - Returns the test file path. - - Args: - self: An instance of the TestCasePlus class. - - Returns: - None. The method does not return any value. - - Raises: - This method does not raise any exceptions. - """ - return self._test_file_path - - @property - def test_file_path_str(self): - """ - Method to retrieve the string representation of the test file path. - - Args: - self: Instance of the TestCasePlus class. - - Type: object - - Purpose: Represents the current instance of the class. - - Restrictions: None - - Returns: - The method returns a string representing the test file path. - - Type: str - - Purpose: Provides the string representation of the test file path. - - Raises: - No exceptions are raised by this method. - """ - return str(self._test_file_path) - - @property - def test_file_dir(self): - """ - This method retrieves the directory path where test files are located. - - Args: - self: An instance of the TestCasePlus class. - This parameter refers to the current instance of the TestCasePlus class. - - Returns: - None. The method does not return any value explicitly but retrieves and returns the test file directory path. - - Raises: - This method does not raise any exceptions. - """ - return self._test_file_dir - - @property - def test_file_dir_str(self): - """ - Method test_file_dir_str in the class TestCasePlus. 
- - Args: - self: Represents the instance of the class. No additional parameters are required. - - Returns: - str: A string representation of the _test_file_dir attribute of the instance. - - Raises: - None. - """ - return str(self._test_file_dir) - - @property - def tests_dir(self): - """ - Method: tests_dir - - Description: - Returns the tests directory path used by the TestCasePlus class. - - Args: - - self (object): The instance of the TestCasePlus class. - - Returns: - - None: This method does not return any value explicitly. - - Raises: - - None - """ - return self._tests_dir - - @property - def tests_dir_str(self): - """ - Returns the tests directory as a string. - - Args: - self: An instance of the TestCasePlus class. - - Returns: - str: The tests directory path converted to a string. - - Raises: - None. - - This method returns the tests directory path as a string. The tests directory is obtained from the '_tests_dir' attribute of the TestCasePlus class. The returned string represents the absolute path of -the tests directory. - - Example usage: - >>> test_case = TestCasePlus() - >>> test_case.tests_dir_str() - '/path/to/tests/directory' - """ - return str(self._tests_dir) - - @property - def examples_dir(self): - """ - Method to get the examples directory path. - - Args: - self: The instance of the class. - - Returns: - None. The method returns the examples directory path. - - Raises: - This method does not raise any exceptions. - """ - return self._examples_dir - - @property - def examples_dir_str(self): - """ - Method examples_dir_str in the class TestCasePlus returns the string representation of the _examples_dir attribute. - - Args: - self: An instance of the TestCasePlus class. - Purpose: Represents the current instance of the class. - Restrictions: None. - - Returns: - str: A string representation of the _examples_dir attribute. - Purpose: Provides a human-readable string representation of the _examples_dir attribute. - - Raises: - None. - """ - return str(self._examples_dir) - - @property - def repo_root_dir(self): - """ - Method to retrieve the root directory of the repository. - - Args: - self (TestCasePlus): The instance of the TestCasePlus class. - This parameter is required to access the instance attributes and methods. - - Returns: - None. The method returns the value of the '_repo_root_dir' attribute of the instance. - - Raises: - This method does not raise any exceptions. - """ - return self._repo_root_dir - - @property - def repo_root_dir_str(self): - """ - Method to retrieve the repository root directory as a string. - - Args: - self: The instance of the class TestCasePlus. - This parameter is automatically passed and refers to the instance itself. - - Returns: - str: A string representing the repository root directory. - This method returns the repository root directory as a string. - - Raises: - None. - """ - return str(self._repo_root_dir) - - @property - def src_dir(self): - """ - Returns the source directory path for the TestCasePlus class. - - Args: - self (TestCasePlus): An instance of the TestCasePlus class. - - Returns: - None: The method does not return any value. - - Raises: - None: This method does not raise any exceptions. - """ - return self._src_dir - - @property - def src_dir_str(self): - """ - Method to retrieve the source directory path as a string representation. - - Args: - self: An instance of the TestCasePlus class. - This parameter refers to the current object instance. 
- It is used to access the source directory path stored in the _src_dir attribute. - - Returns: - None - This method returns the source directory path as a string. If the source directory path does not exist or is empty, None is returned. - - Raises: - None - This method does not raise any exceptions. - """ - return str(self._src_dir) - - def get_env(self): - """ - Return a copy of the `os.environ` object that sets up `PYTHONPATH` correctly, depending on the test suite it's - invoked from. This is useful for invoking external programs from the test suite - e.g. distributed training. - - It always inserts `./src` first, then `./tests` or `./examples` depending on the test suite type and finally - the preset `PYTHONPATH` if any (all full resolved paths). - - """ - env = os.environ.copy() - paths = [self.src_dir_str] - if "/examples" in self.test_file_dir_str: - paths.append(self.examples_dir_str) - else: - paths.append(self.tests_dir_str) - paths.append(env.get("PYTHONPATH", "")) - - env["PYTHONPATH"] = ":".join(paths) - return env - - def get_auto_remove_tmp_dir(self, tmp_dir=None, before=None, after=None): - """ - Args: - tmp_dir (`string`, *optional*): - if `None`: - - - a unique temporary path will be created - - sets `before=True` if `before` is `None` - - sets `after=True` if `after` is `None` - else: - - - `tmp_dir` will be created - - sets `before=True` if `before` is `None` - - sets `after=False` if `after` is `None` - before (`bool`, *optional*): - If `True` and the `tmp_dir` already exists, make sure to empty it right away if `False` and the - `tmp_dir` already exists, any existing files will remain there. - after (`bool`, *optional*): - If `True`, delete the `tmp_dir` at the end of the test if `False`, leave the `tmp_dir` and its contents - intact at the end of the test. - - Returns: - tmp_dir(`string`): either the same value as passed via *tmp_dir* or the path to the auto-selected tmp dir - """ - if tmp_dir is not None: - # defining the most likely desired behavior for when a custom path is provided. - # this most likely indicates the debug mode where we want an easily locatable dir that: - # 1. gets cleared out before the test (if it already exists) - # 2. is left intact after the test - if before is None: - before = True - if after is None: - after = False - - # using provided path - path = Path(tmp_dir).resolve() - - # to avoid nuking parts of the filesystem, only relative paths are allowed - if not tmp_dir.startswith("./"): - raise ValueError( - f"`tmp_dir` can only be a relative path, i.e. `./some/path`, but received `{tmp_dir}`" - ) - - # ensure the dir is empty to start with - if before is True and path.exists(): - shutil.rmtree(tmp_dir, ignore_errors=True) - - path.mkdir(parents=True, exist_ok=True) - - else: - # defining the most likely desired behavior for when a unique tmp path is auto generated - # (not a debug mode), here we require a unique tmp dir that: - # 1. is empty before the test (it will be empty in this situation anyway) - # 2. gets fully removed after the test - if before is None: - before = True - if after is None: - after = True - - # using unique tmp dir (always empty, regardless of `before`) - tmp_dir = tempfile.mkdtemp() - - if after is True: - # register for deletion - self.teardown_tmp_dirs.append(tmp_dir) - - return tmp_dir - - def python_one_liner_max_rss(self, one_liner_str): - """ - Runs the passed python one liner (just the code) and returns how much max cpu memory was used to run the - program. 
- - Args: - one_liner_str (`string`): - a python one liner code that gets passed to `python -c` - - Returns: - max cpu memory bytes used to run the program. This value is likely to vary slightly from run to run. - - Requirements: - this helper needs `/usr/bin/time` to be installed (`apt install time`) - - Example: - - ``` - one_liner_str = 'from transformers import AutoModel; AutoModel.from_pretrained("t5-large")' - max_rss = self.python_one_liner_max_rss(one_liner_str) - ``` - """ - if not cmd_exists("/usr/bin/time"): - raise ValueError("/usr/bin/time is required, install with `apt install time`") - - cmd = shlex.split(f"/usr/bin/time -f %M python -c '{one_liner_str}'") - with CaptureStd() as cs: - execute_subprocess_async(cmd, env=self.get_env()) - # returned data is in KB so convert to bytes - max_rss = int(cs.err.split("\n")[-2].replace("stderr: ", "")) * 1024 - return max_rss - - def tearDown(self): - """ - Tears down the test case by cleaning up temporary directories. - - Args: - self (TestCasePlus): The instance of the TestCasePlus class. - - Returns: - None: This method does not return any value. - - Raises: - None: This method does not raise any exceptions. - """ - # get_auto_remove_tmp_dir feature: remove registered temp dirs - for path in self.teardown_tmp_dirs: - shutil.rmtree(path, ignore_errors=True) - self.teardown_tmp_dirs = [] - - -def mockenv(**kwargs): - """ - this is a convenience wrapper, that allows this :: - - @mockenv(RUN_SLOW=True, USE_TF=False) def test_something(): - run_slow = os.getenv("RUN_SLOW", False) use_tf = os.getenv("USE_TF", False) - - """ - return mock.patch.dict(os.environ, kwargs) - - -# from https://stackoverflow.com/a/34333710/9201239 -@contextlib.contextmanager -def mockenv_context(*remove, **update): - """ - Temporarily updates the `os.environ` dictionary in-place. Similar to mockenv - - The `os.environ` dictionary is updated in-place so that the modification is sure to work in all situations. - - Args: - remove: Environment variables to remove. - update: Dictionary of environment variables and values to add/update. - """ - env = os.environ - update = update or {} - remove = remove or [] - - # List of environment variables being updated or removed. - stomped = (set(update.keys()) | set(remove)) & set(env.keys()) - # Environment variables and values to restore on exit. - update_after = {k: env[k] for k in stomped} - # Environment variables and values to remove on exit. - remove_after = frozenset(k for k in update if k not in env) - - try: - env.update(update) - for k in remove: - env.pop(k, None) - yield - finally: - env.update(update_after) - for k in remove_after: - env.pop(k) - - -# --- pytest conf functions --- # - -# to avoid multiple invocation from tests/conftest.py and examples/conftest.py - make sure it's called only once -pytest_opt_registered = {} - - -def pytest_addoption_shared(parser): - """ - This function is to be called from `conftest.py` via `pytest_addoption` wrapper that has to be defined there. - - It allows loading both `conftest.py` files at once without causing a failure due to adding the same `pytest` - option. - - """ - option = "--make-reports" - if option not in pytest_opt_registered: - parser.addoption( - option, - action="store", - default=False, - help="generate report files. 
The value of this option is used as a prefix to report names", - ) - pytest_opt_registered[option] = 1 - - -def pytest_terminal_summary_main(tr, ids): - """ - Generate multiple reports at the end of test suite run - each report goes into a dedicated file in the current - directory. The report files are prefixed with the test suite name. - - This function emulates --duration and -rA pytest arguments. - - This function is to be called from `conftest.py` via `pytest_terminal_summary` wrapper that has to be defined - there. - - Args: - - tr: `terminalreporter` passed from `conftest.py` - - ids: unique id like `tests` or `examples` that will be incorporated into the final reports filenames - this is - needed as some jobs have multiple runs of pytest, so we can't have them overwrite each other. - - NB: this functions taps into a private _pytest API and while unlikely, it could break should pytest do internal - changes - also it calls default internal methods of terminalreporter which can be hijacked by various `pytest-` - plugins and interfere. - - """ - if not ids: - ids = "tests" - - config = tr.config - orig_writer = config.get_terminal_writer() - orig_tbstyle = config.option.tbstyle - orig_reportchars = tr.reportchars - - dirs = f"reports/{ids}" - Path(dirs).mkdir(parents=True, exist_ok=True) - report_files = { - k: f"{dirs}/{k}.txt" - for k in [ - "durations", - "errors", - "failures_long", - "failures_short", - "failures_line", - "passes", - "stats", - "summary_short", - "warnings", - ] - } - - # custom durations report - # note: there is no need to call pytest --durations=XX to get this separate report - # adapted from https://github.com/pytest-dev/pytest/blob/897f151e/src/_pytest/runner.py#L66 - dlist = [] - for replist in tr.stats.values(): - for rep in replist: - if hasattr(rep, "duration"): - dlist.append(rep) - if dlist: - dlist.sort(key=lambda x: x.duration, reverse=True) - with open(report_files["durations"], "w") as f: - durations_min = 0.05 # sec - f.write("slowest durations\n") - for i, rep in enumerate(dlist): - if rep.duration < durations_min: - f.write(f"{len(dlist)-i} durations < {durations_min} secs were omitted") - break - f.write(f"{rep.duration:02.2f}s {rep.when:<8} {rep.nodeid}\n") - - def summary_failures_short(tr): - # expecting that the reports were --tb=long (default) so we chop them off here to the last frame - reports = tr.getreports("failed") - if not reports: - return - tr.write_sep("=", "FAILURES SHORT STACK") - for rep in reports: - msg = tr._getfailureheadline(rep) - tr.write_sep("_", msg, red=True, bold=True) - # chop off the optional leading extra frames, leaving only the last one - longrepr = re.sub(r".*_ _ _ (_ ){10,}_ _ ", "", rep.longreprtext, 0, re.M | re.S) - tr._tw.line(longrepr) - # note: not printing out any rep.sections to keep the report short - - # use ready-made report funcs, we are just hijacking the filehandle to log to a dedicated file each - # adapted from https://github.com/pytest-dev/pytest/blob/897f151e/src/_pytest/terminal.py#L814 - # note: some pytest plugins may interfere by hijacking the default `terminalreporter` (e.g. 
- # pytest-instafail does that) - - # report failures with line/short/long styles - config.option.tbstyle = "auto" # full tb - with open(report_files["failures_long"], "w") as f: - tr._tw = create_terminal_writer(config, f) - tr.summary_failures() - - # config.option.tbstyle = "short" # short tb - with open(report_files["failures_short"], "w") as f: - tr._tw = create_terminal_writer(config, f) - summary_failures_short(tr) - - config.option.tbstyle = "line" # one line per error - with open(report_files["failures_line"], "w") as f: - tr._tw = create_terminal_writer(config, f) - tr.summary_failures() - - with open(report_files["errors"], "w") as f: - tr._tw = create_terminal_writer(config, f) - tr.summary_errors() - - with open(report_files["warnings"], "w") as f: - tr._tw = create_terminal_writer(config, f) - tr.summary_warnings() # normal warnings - tr.summary_warnings() # final warnings - - tr.reportchars = "wPpsxXEf" # emulate -rA (used in summary_passes() and short_test_summary()) - - # Skip the `passes` report, as it starts to take more than 5 minutes, and sometimes it timeouts on CircleCI if it - # takes > 10 minutes (as this part doesn't generate any output on the terminal). - # (also, it seems there is no useful information in this report, and we rarely need to read it) - # with open(report_files["passes"], "w") as f: - # tr._tw = create_terminal_writer(config, f) - # tr.summary_passes() - - with open(report_files["summary_short"], "w") as f: - tr._tw = create_terminal_writer(config, f) - tr.short_test_summary() - - with open(report_files["stats"], "w") as f: - tr._tw = create_terminal_writer(config, f) - tr.summary_stats() - - # restore: - tr._tw = orig_writer - tr.reportchars = orig_reportchars - config.option.tbstyle = orig_tbstyle - - -# --- distributed testing functions --- # - -# adapted from https://stackoverflow.com/a/59041913/9201239 -class _RunOutput: - - """ - Represents the output of a command execution, including the return code, standard output, and standard error. - - Attributes: - returncode (int): The return code of the executed command. - stdout (str): The standard output captured from the command execution. - stderr (str): The standard error captured from the command execution. - """ - def __init__(self, returncode, stdout, stderr): - """ - __init__(self, returncode, stdout, stderr) - - Initializes the _RunOutput class instance with the provided return code, standard output, and standard error. - - Args: - self (_RunOutput): The instance of the _RunOutput class. - returncode (int): The return code from the executed command. - stdout (str): The standard output generated by the executed command. - stderr (str): The standard error generated by the executed command. - - Returns: - None: This method does not return any value. - - Raises: - No specific exceptions are raised by this method. - """ - self.returncode = returncode - self.stdout = stdout - self.stderr = stderr - - -async def _read_stream(stream, callback): - """ - Docstring for _read_stream function: - - Args: - stream (stream): The input stream from which the function reads data. - callback (function): The callback function to be executed for each line read from the stream. - - Returns: - None. The function does not return any value. - - Raises: - No specific exceptions are raised by this function. 
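Reviewer note: both pytest hooks above are designed to be re-exported from a `conftest.py`. A sketch of that wiring (the conftest location is an assumption; only the `--make-reports` option is defined by the removed helper):

```python
# conftest.py -- illustrative wiring for the removed report helpers
from mindnlp.utils.testing_utils import pytest_addoption_shared, pytest_terminal_summary_main

def pytest_addoption(parser):
    pytest_addoption_shared(parser)  # registers --make-reports exactly once

def pytest_terminal_summary(terminalreporter):
    make_reports = terminalreporter.config.getoption("--make-reports")
    if make_reports:
        # writes reports/<prefix>/{durations,errors,failures_long,failures_short,...}.txt
        pytest_terminal_summary_main(terminalreporter, ids=make_reports)
```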
- """ - while True: - line = await stream.readline() - if line: - callback(line) - else: - break - - -async def _stream_subprocess(cmd, env=None, stdin=None, timeout=None, quiet=False, echo=False) -> _RunOutput: - """ - This function runs a subprocess and captures its standard output and error streams. - - Args: - - cmd (List[str]): A list of command and arguments to be executed. - - env (Optional[Dict[str, str]]): A dictionary of environment variables to be used for the subprocess. - - stdin (Optional[asyncio.subprocess.StreamReader]): A stream representing the standard input for the subprocess. - - timeout (Optional[float]): The maximum time in seconds to wait for the subprocess to complete. - - quiet (bool): If True, suppresses the output of the subprocess. - - echo (bool): If True, prints the command being executed. - - Returns: - _RunOutput: An object containing the return code of the subprocess, its standard output, and standard error. - - Raises: - - asyncio.TimeoutError: If the subprocess execution exceeds the specified timeout. - - OSError: If an OS-related error occurs during the subprocess execution. - - ValueError: If the provided command is invalid or the arguments are of the wrong type. - """ - if echo: - print("\nRunning: ", " ".join(cmd)) - - p = await asyncio.create_subprocess_exec( - cmd[0], - *cmd[1:], - stdin=stdin, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - env=env, - ) - - # note: there is a warning for a possible deadlock when using `wait` with huge amounts of data in the pipe - # https://docs.python.org/3/library/asyncio-subprocess.html#asyncio.asyncio.subprocess.Process.wait - # - # If it starts hanging, will need to switch to the following code. The problem is that no data - # will be seen until it's done and if it hangs for example there will be no debug info. - # out, err = await p.communicate() - # return _RunOutput(p.returncode, out, err) - - out = [] - err = [] - - def tee(line, sink, pipe, label=""): - line = line.decode("utf-8").rstrip() - sink.append(line) - if not quiet: - print(label, line, file=pipe) - - await asyncio.wait( - [ - _read_stream(p.stdout, lambda l: tee(l, out, sys.stdout, label="stdout:")), - _read_stream(p.stderr, lambda l: tee(l, err, sys.stderr, label="stderr:")), - ], - timeout=timeout, - ) - return _RunOutput(await p.wait(), out, err) - - -def execute_subprocess_async(cmd, env=None, stdin=None, timeout=180, quiet=False, echo=True) -> _RunOutput: - """ - Args: - cmd (List[str]): A list of strings representing the command and its arguments to be executed. - env (Optional[Dict[str, str]]): A dictionary of environment variables to be passed to the subprocess. - stdin (Optional[Union[str, bytes]]): The input to be passed to the subprocess. - timeout (int): The maximum time in seconds to wait for the subprocess to complete. - quiet (bool): If True, suppresses output from the subprocess. - echo (bool): If True, prints the subprocess output to the console. - - Returns: - _RunOutput: An object containing the output of the executed subprocess, including stdout, stderr, and returncode. - - Raises: - RuntimeError: If the subprocess fails with a non-zero return code or produces no output. 
- """ - loop = asyncio.get_event_loop() - result = loop.run_until_complete( - _stream_subprocess(cmd, env=env, stdin=stdin, timeout=timeout, quiet=quiet, echo=echo) - ) - - cmd_str = " ".join(cmd) - if result.returncode > 0: - stderr = "\n".join(result.stderr) - raise RuntimeError( - f"'{cmd_str}' failed with returncode {result.returncode}\n\n" - f"The combined stderr from workers follows:\n{stderr}" - ) - - # check that the subprocess actually did run and produced some output, should the test rely on - # the remote side to do the testing - if not result.stdout and not result.stderr: - raise RuntimeError(f"'{cmd_str}' produced no output.") - - return result - - -def pytest_xdist_worker_id(): - """ - Returns an int value of worker's numerical id under `pytest-xdist`'s concurrent workers `pytest -n N` regime, or 0 - if `-n 1` or `pytest-xdist` isn't being used. - """ - worker = os.environ.get("PYTEST_XDIST_WORKER", "gw0") - worker = re.sub(r"^gw", "", worker, 0, re.M) - return int(worker) - - -def get_torch_dist_unique_port(): - """ - Returns a port number that can be fed to `torch.distributed.launch`'s `--master_port` argument. - - Under `pytest-xdist` it adds a delta number based on a worker id so that concurrent tests don't try to use the same - port at once. - """ - port = 29500 - uniq_delta = pytest_xdist_worker_id() - return port + uniq_delta - - -def nested_simplify(obj, decimals=3): - """ - Simplifies an object by rounding float numbers, and downcasting tensors/numpy arrays to get simple equality test - within tests. - """ - if isinstance(obj, list): - return [nested_simplify(item, decimals) for item in obj] - if isinstance(obj, tuple): - return tuple(nested_simplify(item, decimals) for item in obj) - if isinstance(obj, np.ndarray): - return nested_simplify(obj.tolist()) - if isinstance(obj, Mapping): - return {nested_simplify(k, decimals): nested_simplify(v, decimals) for k, v in obj.items()} - if isinstance(obj, (str, int, np.int64)): - return obj - if obj is None: - return obj - if is_mindspore_available() and ops.is_tensor(obj): - return nested_simplify(obj.numpy().tolist()) - if isinstance(obj, float): - return round(obj, decimals) - if isinstance(obj, (np.int32, np.float32)): - return nested_simplify(obj.item(), decimals) - raise RuntimeError(f"Not supported: {type(obj)}") - - -def to_2tuple(x): - """ - Converts the input value to a 2-tuple. - - Args: - x: The value to be converted. It can be of any type. - - Returns: - A 2-tuple with the input value. If the input value is already an iterable, it is returned as is. - Otherwise, a 2-tuple is created with the input value repeated twice. - - Raises: - None. - - """ - if isinstance(x, collections.abc.Iterable): - return x - return (x, x) - - -# These utils relate to ensuring the right error message is received when running scripts -class SubprocessCallException(Exception): - """SubprocessCallException""" -def run_command(command: List[str], return_stdout=False): - """ - Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. 
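Reviewer note: a sketch of how the async subprocess runner and the xdist-aware port helper were combined in distributed tests; the script path and CLI flag below are made up for illustration:

```python
import sys
from mindnlp.utils.testing_utils import (  # removed in this patch
    TestCasePlus,
    execute_subprocess_async,
    get_torch_dist_unique_port,
)

class DistributedLaunchTest(TestCasePlus):
    def test_launch_script(self):
        # one port per pytest-xdist worker, so concurrent tests do not collide
        port = get_torch_dist_unique_port()
        cmd = [
            sys.executable,
            "tests/fixtures/dummy_distributed_script.py",  # hypothetical script
            f"--master_port={port}",
        ]
        # raises RuntimeError on a non-zero return code or if the child printed nothing
        execute_subprocess_async(cmd, env=self.get_env())
```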
Will also properly capture - if an error occurred while running `command` - """ - try: - output = subprocess.check_output(command, stderr=subprocess.STDOUT) - if return_stdout: - if hasattr(output, "decode"): - output = output.decode("utf-8") - return output - except subprocess.CalledProcessError as e: - raise SubprocessCallException( - f"Command `{' '.join(command)}` failed with the following error:\n\n{e.output.decode()}" - ) from e - return None - -class RequestCounter: - """ - Helper class that will count all requests made online. - - Might not be robust if urllib3 changes its logging format but should be good enough for us. - - Usage: - ```py - with RequestCounter() as counter: - _ = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert") - assert counter["GET"] == 0 - assert counter["HEAD"] == 1 - assert counter.total_calls == 1 - ``` - """ - def __enter__(self): - """ - __enter__ - - Args: - self: The instance of the RequestCounter class. - - Returns: - None. This method does not explicitly return a value. - - Raises: - No specific exceptions are raised within this method. - """ - self._counter = defaultdict(int) - self.patcher = patch.object(urllib3.connectionpool.log, "debug", wraps=urllib3.connectionpool.log.debug) - self.mock = self.patcher.start() - return self - - def __exit__(self, *args, **kwargs) -> None: - """ - This method '__exit__' in the class 'RequestCounter' is called upon exiting a context manager. It updates the request counters based on the logged HTTP methods. - - Args: - - self: An instance of the 'RequestCounter' class. It represents the current instance of the class. - - Returns: - - None: This method does not return any value. - - Raises: - This method does not explicitly raise any exceptions. - """ - for call in self.mock.call_args_list: - log = call.args[0] % call.args[1:] - for method in ("HEAD", "GET", "POST", "PUT", "DELETE", "CONNECT", "OPTIONS", "TRACE", "PATCH"): - if method in log: - self._counter[method] += 1 - break - self.patcher.stop() - - def __getitem__(self, key: str) -> int: - """ - Retrieve the count associated with the specified key from the RequestCounter. - - Args: - self (RequestCounter): An instance of the RequestCounter class. - key (str): The key for which the count needs to be retrieved. It should be a string representing the identifier of the request. - - Returns: - int: The count associated with the specified key. This count indicates the number of times the request identified by the key has been made. - - Raises: - KeyError: If the specified key does not exist in the RequestCounter, a KeyError is raised indicating that the count for the key cannot be retrieved. - """ - return self._counter[key] - - @property - def total_calls(self) -> int: - """ - Method to calculate the total number of calls made to the RequestCounter instance. - - Args: - self (RequestCounter): The instance of the RequestCounter class. - This parameter is automatically passed when calling the method. - - Returns: - int: The total number of calls made to the RequestCounter instance. - It is the sum of all the values stored in the internal counter. - - Raises: - No specific exceptions are raised by this method. - """ - return sum(self._counter.values()) - -def is_flaky(max_attempts: int = 5, wait_before_retry: Optional[float] = None, description: Optional[str] = None): - """ - To decorate flaky tests. They will be retried on failures. 
- - Args: - max_attempts (`int`, *optional*, defaults to 5): - The maximum number of attempts to retry the flaky test. - wait_before_retry (`float`, *optional*): - If provided, will wait that number of seconds before retrying the test. - description (`str`, *optional*): - A string to describe the situation (what / where / why is flaky, link to GH issue/PR comments, errors, - etc.) - """ - def decorator(test_func_ref): - @functools.wraps(test_func_ref) - def wrapper(*args, **kwargs): - retry_count = 1 - - while retry_count < max_attempts: - try: - return test_func_ref(*args, **kwargs) - - except Exception as err: - print(f"Test failed with {err} at try {retry_count}/{max_attempts}.", file=sys.stderr) - if wait_before_retry is not None: - time.sleep(wait_before_retry) - retry_count += 1 - - return test_func_ref(*args, **kwargs) - - return wrapper - - return decorator - - -def run_test_in_subprocess(test_case, target_func, inputs=None, timeout=None): - """ - To run a test in a subprocess. In particular, this can avoid (GPU) memory issue. - - Args: - test_case (`unittest.TestCase`): - The test that will run `target_func`. - target_func (`Callable`): - The function implementing the actual testing logic. - inputs (`dict`, *optional*, defaults to `None`): - The inputs that will be passed to `target_func` through an (input) queue. - timeout (`int`, *optional*, defaults to `None`): - The timeout (in seconds) that will be passed to the input and output queues. If not specified, the env. - variable `PYTEST_TIMEOUT` will be checked. If still `None`, its value will be set to `600`. - """ - if timeout is None: - timeout = int(os.environ.get("PYTEST_TIMEOUT", 600)) - - start_methohd = "spawn" - ctx = multiprocessing.get_context(start_methohd) - - input_queue = ctx.Queue(1) - output_queue = ctx.JoinableQueue(1) - - # We can't send `unittest.TestCase` to the child, otherwise we get issues regarding pickle. - input_queue.put(inputs, timeout=timeout) - - process = ctx.Process(target=target_func, args=(input_queue, output_queue, timeout)) - process.start() - # Kill the child process if we can't get outputs from it in time: otherwise, the hanging subprocess prevents - # the test to exit properly. - try: - results = output_queue.get(timeout=timeout) - output_queue.task_done() - except Exception as e: - process.terminate() - test_case.fail(e) - process.join(timeout=timeout) - - if results["error"] is not None: - test_case.fail(f'{results["error"]}') - - -# The following contains utils to run the documentation tests without having to overwrite any files. - -# The `preprocess_string` function adds `# doctest: +IGNORE_RESULT` markers on the fly anywhere a `load_dataset` call is -# made as a print would otherwise fail the corresonding line. - -# To skip cuda tests, make sure to call `SKIP_CUDA_DOCTEST=1 pytest --doctest-modules - - -def preprocess_string(string, skip_cuda_tests): - """Prepare a docstring or a `.md` file to be run by doctest. - - The argument `string` would be the whole file content if it is a `.md` file. For a python file, it would be one of - its docstring. In each case, it may contain multiple python code examples. If `skip_cuda_tests` is `True` and a - cuda stuff is detective (with a heuristic), this method will return an empty string so no doctest will be run for - `string`. 
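Reviewer note: `run_test_in_subprocess` expects a target function that reads its inputs from the first queue and reports a dict with an `"error"` key on the second. The sketch below infers that calling convention from the queue handling above (the function body and names are assumptions) and stacks it with the `is_flaky` retry decorator:

```python
import unittest
from mindnlp.utils.testing_utils import is_flaky, run_test_in_subprocess  # removed in this patch

def _inner_forward_test(in_queue, out_queue, timeout):
    error = None
    try:
        inputs = in_queue.get(timeout=timeout)
        assert inputs["batch_size"] > 0  # the real assertions would live here
    except Exception as e:
        error = f"{e}"
    # the parent process reads this dict and fails the test if "error" is set
    out_queue.put({"error": error}, timeout=timeout)
    out_queue.join()

class SubprocessTest(unittest.TestCase):
    @is_flaky(max_attempts=3, wait_before_retry=2.0)
    def test_forward_isolated(self):
        run_test_in_subprocess(self, _inner_forward_test, inputs={"batch_size": 4})
```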
- """ - codeblock_pattern = r"(```(?:python|py)\s*\n\s*>>> )((?:.*?\n)*?.*?```)" - codeblocks = re.split(re.compile(codeblock_pattern, flags=re.MULTILINE | re.DOTALL), string) - is_cuda_found = False - for i, codeblock in enumerate(codeblocks): - if "load_dataset(" in codeblock and "# doctest: +IGNORE_RESULT" not in codeblock: - codeblocks[i] = re.sub(r"(>>> .*load_dataset\(.*)", r"\1 # doctest: +IGNORE_RESULT", codeblock) - if ( - (">>>" in codeblock or "..." in codeblock) - and re.search(r"cuda|to\(0\)|device=0", codeblock) - and skip_cuda_tests - ): - is_cuda_found = True - break - - modified_string = "" - if not is_cuda_found: - modified_string = "".join(codeblocks) - - return modified_string - - -class HfDocTestParser(doctest.DocTestParser): - """ - Overwrites the DocTestParser from doctest to properly parse the codeblocks that are formatted with black. This - means that there are no extra lines at the end of our snippets. The `# doctest: +IGNORE_RESULT` marker is also - added anywhere a `load_dataset` call is made as a print would otherwise fail the corresponding line. - - Tests involving cuda are skipped base on a naive pattern that should be updated if it is not enough. - """ - # This regular expression is used to find doctest examples in a - # string. It defines three groups: `source` is the source code - # (including leading indentation and prompts); `indent` is the - # indentation of the first (PS1) line of the source code; and - # `want` is the expected output (including leading indentation). - # fmt: off - _EXAMPLE_RE = re.compile(r''' - # Source consists of a PS1 line followed by zero or more PS2 lines. - (?P - (?:^(?P [ ]*) >>> .*) # PS1 line - (?:\n [ ]* \.\.\. .*)*) # PS2 lines - \n? - # Want consists of any non-blank lines that do not start with PS1. - (?P (?:(?![ ]*$) # Not a blank line - (?![ ]*>>>) # Not a line starting with PS1 - # !!!!!!!!!!! HF Specific !!!!!!!!!!! - (?:(?!```).)* # Match any character except '`' until a '```' is found (this is specific to HF because black removes the last line) - # !!!!!!!!!!! HF Specific !!!!!!!!!!! - (?:\n|$) # Match a new line or end of string - )*) - ''', re.MULTILINE | re.VERBOSE - ) - # fmt: on - - # !!!!!!!!!!! HF Specific !!!!!!!!!!! - skip_cuda_tests: bool = bool(os.environ.get("SKIP_CUDA_DOCTEST", False)) - # !!!!!!!!!!! HF Specific !!!!!!!!!!! - - def parse(self, string, name=""): - """ - Overwrites the `parse` method to incorporate a skip for CUDA tests, and remove logs and dataset prints before - calling `super().parse` - """ - string = preprocess_string(string, self.skip_cuda_tests) - return super().parse(string, name) - - -class HfDoctestModule(Module): - """ - Overwrites the `DoctestModule` of the pytest package to make sure the HFDocTestParser is used when discovering - tests. - """ - def collect(self) -> Iterable[DoctestItem]: - """ - Collects doctests from the specified module. - - Args: - self (HfDoctestModule): The instance of the HfDoctestModule class. - - Returns: - Iterable[DoctestItem]: A collection of doctests represented as DoctestItem objects. - - Raises: - ImportError: If the module cannot be imported and the 'doctest_ignore_import_errors' configuration option is not set. - Skip: If the 'doctest_ignore_import_errors' configuration option is set and the module cannot be imported. - """ - class MockAwareDocTestFinder(doctest.DocTestFinder): - """A hackish doctest finder that overrides stdlib internals to fix a stdlib bug. 
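Reviewer note: to make the doctest preprocessing concrete, here is what `preprocess_string` does to a made-up snippet:

```python
from mindnlp.utils.testing_utils import preprocess_string  # removed in this patch

fence = "`" * 3  # avoids nesting literal triple backticks in this note
snippet = (
    f"{fence}python\n"
    ">>> from datasets import load_dataset\n"
    '>>> ds = load_dataset("imdb")\n'
    f"{fence}"
)
# The load_dataset call gains a trailing `# doctest: +IGNORE_RESULT`, so its progress
# output cannot fail the doctest. With skip_cuda_tests=True, a block mentioning
# `cuda`, `to(0)` or `device=0` would instead make the function return an empty string.
print(preprocess_string(snippet, skip_cuda_tests=True))
```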
- - https://github.com/pytest-dev/pytest/issues/3456 https://bugs.python.org/issue25532 - """ - - def _find_lineno(self, obj, source_lines): - """Doctest code does not take into account `@property`, this - is a hackish way to fix it. https://bugs.python.org/issue17446 - - Wrapped Doctests will need to be unwrapped so the correct line number is returned. This will be - reported upstream. #8796 - """ - if isinstance(obj, property): - obj = getattr(obj, "fget", obj) - - if hasattr(obj, "__wrapped__"): - # Get the main obj in case of it being wrapped - obj = inspect.unwrap(obj) - - # Type ignored because this is a private function. - return super()._find_lineno( # type:ignore[misc] - obj, - source_lines, - ) - - def _find(self, tests, obj, name, module, source_lines, globs, seen) -> None: - if _is_mocked(obj): - return - with _patch_unwrap_mock_aware(): - # Type ignored because this is a private function. - super()._find( # type:ignore[misc] - tests, obj, name, module, source_lines, globs, seen - ) - - if self.path.name == "conftest.py": - module = self.config.pluginmanager._importconftest( - self.path, - self.config.getoption("importmode"), - rootpath=self.config.rootpath, - ) - else: - try: - module = import_path( - self.path, - root=self.config.rootpath, - mode=self.config.getoption("importmode"), - ) - except ImportError: - if self.config.getvalue("doctest_ignore_import_errors"): - skip(f"unable to import module {self.path}") - else: - raise - - # !!!!!!!!!!! HF Specific !!!!!!!!!!! - finder = MockAwareDocTestFinder(parser=HfDocTestParser()) - # !!!!!!!!!!! HF Specific !!!!!!!!!!! - optionflags = get_optionflags(self) - runner = _get_runner( - verbose=False, - optionflags=optionflags, - checker=_get_checker(), - continue_on_failure=_get_continue_on_failure(self.config), - ) - for test in finder.find(module, module.__name__): - if test.examples: # skip empty doctests and cuda - yield DoctestItem.from_parent(self, name=test.name, runner=runner, dtest=test) - - -def _device_agnostic_dispatch(device: str, dispatch_table: Dict[str, Callable], *args, **kwargs): - """ - Executes a device-agnostic dispatch based on the given device and dispatch table. - - Args: - device (str): The device for which the dispatch is performed. - dispatch_table (Dict[str, Callable]): A dictionary containing the dispatch functions for different devices. - - Returns: - None: Returns None if the dispatch function for the given device is None. - - Raises: - None: This function does not raise any exceptions. - """ - if device not in dispatch_table: - return dispatch_table["default"](*args, **kwargs) - - fn = dispatch_table[device] - - # Some device agnostic functions return values. Need to guard against `None` - # instead at user level. - if fn is None: - return None - return fn(*args, **kwargs) - -def get_tests_dir(append_path=None): - """ - Args: - append_path: optional path to append to the tests dir path - - Return: - The full path to the `tests` dir, so that the tests can be invoked from anywhere. Optionally `append_path` is - joined after the `tests` dir the former is provided. - - """ - # this function caller's __file__ - caller__file__ = inspect.stack()[1][1] - tests_dir = os.path.abspath(os.path.dirname(caller__file__)) - - while not tests_dir.endswith("tests"): - tests_dir = os.path.dirname(tests_dir) - - if append_path: - return os.path.join(tests_dir, append_path) - return tests_dir - -def check_json_file_has_correct_format(file_path): - ''' - Check if the provided JSON file has the correct format. 
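Reviewer note: `_device_agnostic_dispatch` drives a small dispatch-table pattern. The table below is illustrative; only `backend_empty_cache`, defined a little further down in the same removed file, is real:

```python
from mindnlp.utils.testing_utils import _device_agnostic_dispatch, backend_empty_cache  # removed

# Map device names to callables; the "default" entry covers any unknown device.
BACKEND_EMPTY_CACHE = {
    "npu": backend_empty_cache,
    "cpu": None,              # None means "nothing to do": the dispatcher returns None
    "default": backend_empty_cache,
}

_device_agnostic_dispatch("npu", BACKEND_EMPTY_CACHE)   # calls backend_empty_cache()
_device_agnostic_dispatch("cpu", BACKEND_EMPTY_CACHE)   # returns None without calling anything
_device_agnostic_dispatch("cuda", BACKEND_EMPTY_CACHE)  # falls back to the "default" entry
```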
- - Args: - file_path (str): The path to the JSON file to be checked. - - Returns: - None: This function does not return any value. - - Raises: - AssertionError: If the JSON file does not have the correct format as per the specified conditions. - FileNotFoundError: If the specified file_path does not exist. - UnicodeDecodeError: If the file cannot be decoded using the specified encoding. - ''' - with open(file_path, "r", encoding='utf-8') as f: - lines = f.readlines() - if len(lines) == 1: - # length can only be 1 if dict is empty - assert lines[0] == "{}" - else: - # otherwise make sure json has correct format (at least 3 lines) - assert len(lines) >= 3 - # each key one line, ident should be 2, min length is 3 - assert lines[0].strip() == "{" - for _ in lines[1:-1]: - left_indent = len(lines[1]) - len(lines[1].lstrip()) - assert left_indent == 2 - assert lines[-1].strip() == "}" - -_run_staging = parse_flag_from_env("MINDNLP_CO_STAGING", default=False) - -def is_staging_test(test_case): - """ - Decorator marking a test as a staging test. - - Those tests will run using the staging environment of huggingface.co instead of the real model hub. - """ - if not _run_staging: - return unittest.skip("test is staging test")(test_case) - else: - try: - import pytest # We don't need a hard dependency on pytest in the main library - except ImportError: - return test_case - else: - return pytest.mark.is_staging_test()(test_case) - - -def require_soundfile(test_case): - """ - Decorator marking a test that requires soundfile - - These tests are skipped when soundfile isn't installed. - - """ - return unittest.skipUnless(is_soundfile_availble(), "test requires soundfile")( - test_case - ) - -def backend_empty_cache(): - if hasattr(mindspore, 'hal'): - mindspore.hal.empty_cache() diff --git a/mindnlp/utils/winfcntlock.py b/mindnlp/utils/winfcntlock.py deleted file mode 100644 index a3988e2c7..000000000 --- a/mindnlp/utils/winfcntlock.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
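Reviewer note: the shape `check_json_file_has_correct_format` accepts is either a bare `{}` or a dict serialized with a two-space indent; a minimal sketch using a hypothetical file:

```python
import json
from mindnlp.utils.testing_utils import check_json_file_has_correct_format  # removed in this patch

with open("tiny_config.json", "w", encoding="utf-8") as f:
    json.dump({"hidden_size": 32, "num_hidden_layers": 2}, f, indent=2)

# passes: first line is "{", inner lines are indented by 2 spaces, last line is "}"
check_json_file_has_correct_format("tiny_config.json")
```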
-# ============================================================================ -"""fcntl replacement for Windows.""" -import win32con # pylint: disable=import-error -import pywintypes # pylint: disable=import-error -import win32file # pylint: disable=import-error - - -LOCK_EX = win32con.LOCKFILE_EXCLUSIVE_LOCK -LOCK_SH = 0 # The default value -LOCK_NB = win32con.LOCKFILE_FAIL_IMMEDIATELY -__overlapped = pywintypes.OVERLAPPED() - -def lock(file, flags): - hfile = win32file._get_osfhandle(file.fileno()) - win32file.LockFileEx(hfile, flags, 0, 0xffff0000, __overlapped) - -def unlock(file): - hfile = win32file._get_osfhandle(file.fileno()) - win32file.UnlockFileEx(hfile, 0, 0xffff0000, __overlapped) diff --git a/mindnlp/vocab/__init__.py b/mindnlp/vocab/__init__.py deleted file mode 100644 index 20d77ed18..000000000 --- a/mindnlp/vocab/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -vocab init -""" - -from .vocab import Vocab diff --git a/mindnlp/vocab/vocab.py b/mindnlp/vocab/vocab.py deleted file mode 100644 index 1b67c653c..000000000 --- a/mindnlp/vocab/vocab.py +++ /dev/null @@ -1,300 +0,0 @@ -# Copyright 2022 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -# pylint:disable=I1101 - -"""Vocab Class""" - -import os -import re -import warnings -from typing import Union -from mindspore.dataset import TextBaseDataset - -from ..configs import DEFAULT_ROOT -from ..utils.download import get_from_cache - -class Vocab: - r""" - Creates a vocab object which maps tokens to indices. - """ - def __init__(self, list_or_dict: Union[list, dict], - special_tokens: Union[list, tuple] = None, - special_first: bool = True): - """ - Initialize the Vocab class with the provided list or dictionary of tokens along with optional special tokens. - - Args: - - list_or_dict (Union[list, dict]): A list or dictionary containing tokens. If a list is provided, each token will be assigned a unique index. If a dictionary is provided, keys must be strings and -values must be integers representing token indices. - - special_tokens (Union[list, tuple], optional): A list or tuple containing special tokens to be included in the vocabulary. Defaults to None. 
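Reviewer note: the removed `winfcntlock` module mirrors the small subset of `fcntl` locking the library used on Windows; an illustrative use (the lock-file path is made up, and pywin32 must be installed):

```python
# Windows-only sketch of the removed helper, analogous to fcntl.flock(f, fcntl.LOCK_EX)
from mindnlp.utils.winfcntlock import LOCK_EX, LOCK_NB, lock, unlock  # removed in this patch

with open("download.lock", "w") as lock_file:
    lock(lock_file, LOCK_EX)        # pass LOCK_EX | LOCK_NB to fail fast instead of waiting
    try:
        pass  # critical section, e.g. writing to a shared cache directory
    finally:
        unlock(lock_file)
```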
- - special_first (bool, optional): A boolean flag indicating whether special tokens should be added at the beginning of the vocabulary. Defaults to True. - - Returns: - None. This method initializes the Vocab object with the provided tokens and special tokens. - - Raises: - - ValueError: Raised if keys in the dictionary are not strings, values in the dictionary are not integers, or if the input is not a list or dictionary. - """ - self._token_dict = {} - - sp_len = len(special_tokens) if special_tokens is not None and special_first else 0 - - if isinstance(list_or_dict, list): - for index, value in enumerate(list_or_dict): - self._token_dict[value] = index + sp_len - elif isinstance(list_or_dict, dict): - for key, value in list_or_dict.items(): - if not isinstance(key, str): - raise ValueError(f'keys in dict must be str, but got {type(key)}') - if not isinstance(value, int): - raise ValueError(f'values in dict must be int, but got {type(key)}') - self._token_dict[key] = value + sp_len - else: - raise ValueError(f'Vocab only support list or dict, but get {type(list_or_dict)}') - - if special_tokens is not None: - offset = 0 if special_first else len(self._token_dict) - for idx, tok in enumerate(special_tokens): - self._token_dict[tok] = idx + offset - - self._index_dict = {v: k for k, v in self._token_dict.items()} - - def __len__(self) -> int: - r""" - Returns: - - int, The length of the vocab. - """ - return len(self._token_dict) - - def __contains__(self, token: str) -> bool: - r""" - Args: - token (str): The token for which to check the membership. - - Returns: - - bool, Whether the token is member of vocab or not. - """ - return token in self._token_dict - - def __getitem__(self, token: str) -> int: - r""" - Args: - token (str): The token used to lookup the corresponding index. - - Returns: - - int, The index corresponding to the associated token. - """ - return self._token_dict.get(token, None) - - def __call__(self, token_or_id): - """ - The __call__ method in the Vocab class retrieves token information based on the provided token or token ID. - - Args: - self (Vocab): The instance of the Vocab class. - token_or_id (str or int): The token or token ID for which information is to be retrieved. - If a string is provided, the method attempts to retrieve the token information using the token as a key in the _token_dict. - If an integer is provided, the method attempts to retrieve the token information using the token ID as a key in the _index_dict. - - Returns: - None: If the token or token ID is not found in the respective dictionaries, the method returns None. - - Raises: - ValueError: If the token_or_id parameter is not a string or integer, a ValueError is raised with a message indicating the unsupported token type. - """ - if isinstance(token_or_id, str): - return self._token_dict.get(token_or_id, None) - if isinstance(token_or_id, int): - return self._index_dict.get(token_or_id, None) - - raise ValueError(f'not support token type {type(token_or_id)}') - - def lookup_ids(self, token_or_list): - """ - Converts a token string or a sequence of tokens in a single integer id or a sequence of ids. - - Args: - token_or_list (Union[str, list[str]]): One or several token(s) to convert to token id(s). - - Returns: - - list[int], The token id or list of token ids. - if only one token used to lookup, - return one id instead of a list of ids. 
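Reviewer note: a quick sketch of the index layout `Vocab.__init__` produces (token strings are illustrative):

```python
from mindnlp.vocab import Vocab  # removed in this patch

vocab = Vocab(["hello", "world"], special_tokens=["<pad>", "<unk>"], special_first=True)

len(vocab)         # 4 -- two regular tokens plus two special tokens
vocab["hello"]     # 2 -- regular ids are shifted past the prepended specials
vocab("world")     # 3 -- __call__ with a str behaves like __getitem__
vocab(0)           # "<pad>" -- __call__ with an int does the reverse lookup
"<unk>" in vocab   # True
```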
- - Examples: - >>> import mindspore.dataset.text as text - >>> vocab = text.Vocab.from_list(["w1", "w2", "w3"], special_tokens=[""], special_first=True) - >>> ids = vocab.lookup_ids(["w1", "w3"]) - """ - if isinstance(token_or_list, str): - return self._token_dict.get(token_or_list) - - if isinstance(token_or_list, list): - return_list = [] - for token in token_or_list: - if token not in self._token_dict: - raise ValueError(f"{token} is not in vocab.") - return_list.append(self._token_dict[token]) - return return_list - - raise ValueError(f'lookup only support str and list, but got {type(token_or_list)}.') - - def lookup_tokens(self, index_or_list): - """ - Converts a single index or a sequence of indices in a token or a sequence of tokens. - If id does not exist, return empty string. - - Args: - index_or_list (Union[int, list[int]]): The token id (or token ids) to convert to tokens. - - Returns: - - List, The decoded token(s). - if only one id used to lookup, - return one token instead of a list of tokens. - - Raises: - RuntimeError: If 'ids' is not in vocab. - - Examples: - >>> import mindspore.dataset.text as text - >>> vocab = text.Vocab.from_list(["w1", "w2", "w3"], special_tokens=[""], special_first=True) - >>> token = vocab.lookup_tokens(0) - """ - if isinstance(index_or_list, int): - return self._index_dict.get(index_or_list, None) - - if isinstance(index_or_list, list): - return_list = [] - for idx in index_or_list: - if idx not in self._index_dict: - raise ValueError(f"{idx} is not in vocab.") - return_list.append(self._index_dict[idx]) - return return_list - - raise ValueError(f'lookup only support int and list, but got {type(index_or_list)}.') - - def append_token(self, token): - r""" - Args: - token (str): The token used to lookup the corresponding index. - - Raises: - RuntimeError: If `token` already exists in the vocab. - - """ - if isinstance(token, str): - if token in self._token_dict: - warnings.warn(f"{token} already exists in the vocab.") - else: - append_id = len(self._token_dict) - self._token_dict[token] = append_id - self._index_dict[append_id] = token - else: - raise TypeError(f"{token} is not str.") - - @classmethod - def from_dataset(cls, dataset, columns=None, freq_range=None, top_k=None, special_tokens=None, special_first=True): - """ - Build a Vocab from a dataset. - - This would collect all unique words in a dataset and return a vocab within - the frequency range specified by user in freq_range. User would be warned if no words fall into the frequency. - Words in vocab are ordered from the highest frequency to the lowest frequency. Words with the same frequency - would be ordered lexicographically. - - Args: - dataset (Dataset): dataset to build vocab from. - columns (list[str], optional): column names to get words from. It can be a list of column names. - Default: None. - freq_range (tuple, optional): A tuple of integers (min_frequency, max_frequency). Words within the frequency - range would be kept. 0 <= min_frequency <= max_frequency <= total_words. min_frequency=0 is the same as - min_frequency=1. max_frequency > total_words is the same as max_frequency = total_words. - min_frequency/max_frequency can be None, which corresponds to 0/total_words separately. - Default: None, all words are included. - top_k (int, optional): top_k is greater than 0. Number of words to be built into vocab. top_k means most - frequent words are taken. top_k is taken after freq_range. If not enough top_k, all words will be taken. - Default: None, all words are included. 
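Reviewer note: continuing the sketch above, the lookup and append helpers round-trip between tokens and ids:

```python
ids = vocab.lookup_ids(["hello", "world"])   # [2, 3]; unknown tokens raise ValueError
tokens = vocab.lookup_tokens(ids)            # ["hello", "world"]
vocab.lookup_ids("hello")                    # 2 -- a single str returns a single id

vocab.append_token("mindnlp")                # appended at the next free index, here 4
vocab("mindnlp")                             # 4
```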
- special_tokens (list, optional): A list of strings, each one is a special token. For example - special_tokens=["",""]. Default: None, no special tokens will be added. - special_first (bool, optional): Whether special_tokens will be prepended/appended to vocab. If - special_tokens is specified and special_first is set to True, special_tokens will be prepended. - Default: True. - - Returns: - - Vocab, Vocab object built from the dataset. - - Examples: - >>> import mindspore.dataset as ds - >>> import mindspore.dataset.text as text - >>> dataset = ds.TextFileDataset("/path/to/sentence/piece/vocab/file", shuffle=False) - >>> vocab = text.Vocab.from_dataset(dataset, "text", freq_range=None, top_k=None, - ... special_tokens=["", ""], - ... special_first=True) - >>> dataset = dataset.map(operations=text.Lookup(vocab, ""), input_columns=["text"]) - """ - if not isinstance(dataset, TextBaseDataset): - raise ValueError('dataset must be subclass of TextBaseDataset.') - - ds_vocab = dataset._build_vocab(columns, freq_range, top_k, special_tokens, special_first) - vocab = Vocab(ds_vocab.vocab()) - - return vocab - - @classmethod - def from_pretrained(cls, name="glove.6B.50d", root=DEFAULT_ROOT, - special_tokens=("", ""), special_first=True): - r""" - Args: - name (str): The name of the pretrained vector. Default: "glove.6B.50d". - root (str): Default storage directory. Default: DEFAULT_ROOT. - special_tokens (str|Tuple[str]): List of special participles. Default: ("", ""). - special_first (bool): Indicates whether special participles from special_tokens will be added to - the top of the dictionary. If True, add special_tokens to the beginning of the dictionary, - otherwise add them to the end. Default: True. - - Returns: - - Vocab, Returns a vocab generated from the url download. - """ - tokens = [] - url = pretrained_aliases[name] - - cache_dir = os.path.join(root, "vocabs") - download_file_name = re.sub(r".+/", "", url) - path = get_from_cache(download_file_name=download_file_name, cache_dir=cache_dir, url=url) - - with open(path, 'r', encoding='utf-8') as file: - file.readline() - for line in file: - tokens.append(line.rstrip("\n")) - - vocab = Vocab(tokens, list(special_tokens), special_first) - - return vocab - - @property - def vocab(self): - """return vocab dict.""" - return self._token_dict - -pretrained_aliases = { - "glove.6B.50d": "https://download.mindspore.cn/toolkits/mindnlp/vocab/Glove/glove.6B.50d.txt", - "glove.6B.100d": "https://download.mindspore.cn/toolkits/mindnlp/vocab/Glove/glove.6B.100d.txt", - "glove.6B.200d": "https://download.mindspore.cn/toolkits/mindnlp/vocab/Glove/glove.6B.200d.txt", - "glove.6B.300d": "https://download.mindspore.cn/toolkits/mindnlp/vocab/Glove/glove.6B.300d.txt", - "fasttext": "https://download.mindspore.cn/toolkits/mindnlp/vocab/Fasttext/wiki-news-300d-1M.txt", - "fasttext-subword": "https://download.mindspore.cn/toolkits/mindnlp/vocab/Fasttext/wiki-news-300d-1M-subword.txt", -} - -__all__ = ['Vocab'] diff --git a/setup.py b/setup.py index b71b04c18..04a5544d7 100644 --- a/setup.py +++ b/setup.py @@ -131,7 +131,7 @@ def run(self): 'pyctcdecode', 'pytest==7.2.0', 'pillow>=10.0.0', - 'mindtorch@git+https://github.com/lvyufeng/mindtorch.git' + 'mindtorch@git+https://openi.pcl.ac.cn/lvyufeng/mindtorch.git' ], classifiers=[ 'License :: OSI Approved :: Apache Software License'
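Closing reviewer note: `Vocab.from_pretrained` (deleted above) used the `pretrained_aliases` URLs listed in the removed module; for anyone still depending on it, its historical usage was simply:

```python
from mindnlp.vocab import Vocab  # removed in this patch

# downloads the GloVe token list into <root>/vocabs via get_from_cache and
# prepends the two default special tokens
vocab = Vocab.from_pretrained("glove.6B.50d")
print(len(vocab))
```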