diff --git a/mindnlp/accelerate/__init__.py b/mindnlp/accelerate/__init__.py
new file mode 100644
index 000000000..51c7f5deb
--- /dev/null
+++ b/mindnlp/accelerate/__init__.py
@@ -0,0 +1,18 @@
+import sys
+import accelerate
+from transformers.utils import _LazyModule
+
+_import_structure = {
+ "utils": [
+ 'DistributedType',
+ ]
+}
+
+sys.modules[__name__] = _LazyModule(
+ 'accelerate',
+ accelerate.__file__,
+ _import_structure,
+ module_spec=__spec__,
+ extra_objects={"__version__": accelerate.__version__},
+)
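+
+# Usage sketch: the lazy module resolves listed names on first access, e.g.
+#   from mindnlp.accelerate import DistributedType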
diff --git a/mindnlp/core/__init__.py b/mindnlp/core/__init__.py
index ec74c77c0..1a5ce8335 100644
--- a/mindnlp/core/__init__.py
+++ b/mindnlp/core/__init__.py
@@ -46,5 +46,28 @@
from ._bind import get_default_dtype, set_default_dtype
from . import profiler, cuda, optim, amp, compiler, jit, version, __future__, overrides, \
- return_types
+ return_types, linalg
+from ._lowrank import svd_lowrank
+
+def _has_compatible_shallow_copy_type(tensor, other):
+ """
+ Mimics the behavior of mindtorch._has_compatible_shallow_copy_type.
+
+ Args:
+ tensor (mindtorch.Tensor): The source tensor.
+ other (mindtorch.Tensor): The target tensor to check compatibility.
+
+ Returns:
+ bool: True if `tensor` and `other` have compatible types for shallow copy.
+ """
+ # Both arguments must be tensors
+ if not is_tensor(tensor) or not is_tensor(other):
+ return False
+
+ # Shallow copy requires matching shapes
+ if tensor.shape != other.shape:
+ return False
+
+ # Compatibility confirmed
+ return True
\ No newline at end of file
diff --git a/mindnlp/core/_dtype.py b/mindnlp/core/_dtype.py
index 5bb777f92..b145865c5 100644
--- a/mindnlp/core/_dtype.py
+++ b/mindnlp/core/_dtype.py
@@ -9,6 +9,7 @@ def is_floating_point(self):
return isinstance(self, (typing.Float, typing.BFloat16))
Type.is_floating_point = is_floating_point
+Type.__str__ = Type.__repr__
half = float16
float = float32
@@ -19,6 +20,7 @@ def is_floating_point(self):
bool = bool_
float8_e4m3fn = None # TODO: not support fp8 for now
+float8_e5m2 = None
np2dtype = {
np.bool_: bool,
diff --git a/mindnlp/core/_dynamo/utils.py b/mindnlp/core/_dynamo/utils.py
deleted file mode 100644
index 1f7544cae..000000000
--- a/mindnlp/core/_dynamo/utils.py
+++ /dev/null
@@ -1,3 +0,0 @@
-def is_compile_supported(device_type):
- compile_supported = False
- return compile_supported
\ No newline at end of file
diff --git a/mindnlp/core/_lowrank.py b/mindnlp/core/_lowrank.py
index c03d4f468..819e42f00 100644
--- a/mindnlp/core/_lowrank.py
+++ b/mindnlp/core/_lowrank.py
@@ -5,8 +5,8 @@
from typing import Optional
from mindnlp import core
-from mindnlp.core import _linalg_utils as _utils, Tensor
-from core.overrides import handle_torch_function, has_torch_function
+from . import _linalg_utils as _utils, Tensor
+from .overrides import handle_torch_function, has_torch_function
def get_approximate_basis(
diff --git a/mindnlp/core/_tensor.py b/mindnlp/core/_tensor.py
index f4251f01b..ef287c4ba 100644
--- a/mindnlp/core/_tensor.py
+++ b/mindnlp/core/_tensor.py
@@ -1,17 +1,29 @@
+import math
import numpy as np
import mindspore
from mindspore import Tensor
from mindspore.common.tensor import _TensorMeta
+from mindspore._c_expression.typing import Type
try:
from mindspore.common._stub_tensor import StubTensor
except:
class StubTensor: pass
+from . import ops, _dtype
from ._dtype import dtype2np
-from ._bind import get_default_device
-
-from ._dtype import *
+from ._bind import get_default_device, device_
+from .configs import use_pyboost, ON_A1
+from .storage import UntypedStorage
+DTYPE_ELEMENT_SIZE_MAP = {
+ mindspore.float64: 8,
+ mindspore.int64: 8,
+ mindspore.int32: 4,
+ mindspore.float32: 4,
+ mindspore.int16: 2,
+ mindspore.bfloat16: 2,
+ mindspore.float16: 2,
+ mindspore.int8: 1,
+ mindspore.uint8: 1,
+ mindspore.bool_: 1,
+}
class TypedTensorMeta(_TensorMeta):
def __isinstancecheck__(self, instance):
@@ -20,31 +32,31 @@ def __isinstancecheck__(self, instance):
return instance.dtype == self.dtype
class LongTensor(Tensor, metaclass=TypedTensorMeta):
- dtype = long
+ dtype = _dtype.long
def __init__(self, data, device=None):
- super().__init__(data, dtype=long)
+ super().__init__(data, dtype=_dtype.long)
class FloatTensor(Tensor, metaclass=TypedTensorMeta):
- dtype = float32
+ dtype = _dtype.float32
def __init__(self, data, device=None):
- super().__init__(data, dtype=float32)
+ super().__init__(data, dtype=_dtype.float32)
class HalfTensor(Tensor, metaclass=TypedTensorMeta):
- dtype = float16
+ dtype = _dtype.float16
def __init__(self, data, device=None):
- super().__init__(data, dtype=float16)
+ super().__init__(data, dtype=_dtype.float16)
class BFloat16Tensor(Tensor, metaclass=TypedTensorMeta):
- dtype = float16
+ dtype = _dtype.bfloat16
def __init__(self, data, device=None):
- super().__init__(data, dtype=bfloat16)
+ super().__init__(data, dtype=_dtype.bfloat16)
class BoolTensor(Tensor, metaclass=TypedTensorMeta):
- dtype = bool
+ dtype = _dtype.bool
def __init__(self, data, device=None):
- super().__init__(data, dtype=bool)
+ super().__init__(data, dtype=_dtype.bool)
def tensor(data, *, dtype=None, device=None, requires_grad=False):
if isinstance(data, Tensor):
@@ -105,8 +117,8 @@ def data_ptr(self):
Tensor.data_ptr = data_ptr
StubTensor.data_ptr = data_ptr
- Tensor.device = None
- StubTensor.device = None
+ Tensor.device = device_('not support yet.')
+ StubTensor.device = device_('not support yet.')
def _expand(self, *size):
if len(size) == 1:
@@ -115,3 +127,136 @@ def _expand(self, *size):
Tensor.expand = _expand
StubTensor.expand = _expand
+
+ def clone(self, *args, **kwargs):
+ return self.copy()
+
+ Tensor.clone = clone
+ StubTensor.clone = clone
+
+ def _repeat(self, *sizes):
+ if len(sizes) == 1 and isinstance(sizes[0], (list, tuple)):
+ sizes = sizes[0]
+ return ops.tile(self, tuple(sizes))
+
+ Tensor.repeat = _repeat
+ StubTensor.repeat = _repeat
+
+ def __or__(self, other):
+ if isinstance(other, (int, bool, float, Tensor)):
+ return ops.bitwise_or(self, other)
+ raise TypeError("Unsupported operand type(s) for |: 'Tensor' and '{}'".format(type(other)))
+
+ Tensor.__or__ = __or__
+ StubTensor.__or__ = __or__
+
+ def __and__(self, other):
+ if isinstance(other, (int, bool, float, Tensor)):
+ return ops.bitwise_and(self, other)
+ raise TypeError("Unsupported operand type(s) for &: 'Tensor' and '{}'".format(type(other)))
+
+ Tensor.__and__ = __and__
+ StubTensor.__and__ = __and__
+
+ def detach(self):
+ return ops.stop_gradient(self)
+
+ Tensor.detach = detach
+ StubTensor.detach = detach
+
+ origin_getitem = Tensor.__getitem__
+ def __getitem__(self, slices):
+ # if 0 in self.shape:
+ # return self
+ if isinstance(slices, tuple):
+ new_slices = ()
+ for s in slices:
+ if isinstance(s, range):
+ s = slice(s.start, s.stop, s.step)
+ new_slices += (s,)
+ slices = new_slices
+ return origin_getitem(self, slices)
+
+ Tensor.__getitem__ = __getitem__
+ StubTensor.__getitem__ = __getitem__
+
+ def numel(self):
+ return math.prod(self.shape)
+
+ Tensor.numel = numel
+ StubTensor.numel = numel
+ Tensor.nelement = numel
+ StubTensor.nelement = numel
+
+ @property
+ def nbytes(self):
+ return self.numel() * self.element_size()
+
+ Tensor.nbytes = nbytes
+ StubTensor.nbytes = nbytes
+
+ Tensor.normal_ = ops.inplace_normal
+ StubTensor.normal_ = ops.inplace_normal
+
+
+ Tensor.softmax = ops.softmax
+ StubTensor.softmax = ops.softmax
+
+ Tensor.squeeze = ops.squeeze
+ StubTensor.squeeze = ops.squeeze
+
+ Tensor.unsqueeze = ops.unsqueeze
+ StubTensor.unsqueeze = ops.unsqueeze
+
+ def log_softmax(self, dim=-1, dtype=None):
+ if use_pyboost():
+ return mindspore.mint.nn.functional.log_softmax(self, dim=dim, dtype=dtype)
+ out = mindspore.ops.log_softmax(self, dim)
+ if dtype is not None:
+ out = out.to(dtype)
+ return out
+
+ Tensor.log_softmax = log_softmax
+ StubTensor.log_softmax = log_softmax
+
+ def untyped_storage(self):
+ return UntypedStorage(self)
+
+ Tensor.untyped_storage = untyped_storage
+ StubTensor.untyped_storage = untyped_storage
+
+ def element_size(self):
+ return DTYPE_ELEMENT_SIZE_MAP[self.dtype]
+
+ Tensor.element_size = element_size
+ StubTensor.element_size = element_size
+
+ @property
+ def layout(self):
+ return None
+
+ Tensor.layout = layout
+ StubTensor.layout = layout
+
+ def __add__(self, other):
+ # if 0 in self.shape:
+ # return self
+ return ops.add(self, other)
+
+ Tensor.__add__ = __add__
+ StubTensor.__add__ = __add__
+
+ Tensor.repeat_interleave = ops.repeat_interleave
+ StubTensor.repeat_interleave = ops.repeat_interleave
+
+ def dim(self):
+ return self.ndim
+
+ Tensor.dim = dim
+ StubTensor.dim = dim
+
+ def unfold(self, dimension, size, step):
+ return ops.unfold(self, dimension, size, step)
+
+ Tensor.unfold = unfold
+ StubTensor.unfold = unfold
diff --git a/mindnlp/core/autograd/__init__.py b/mindnlp/core/autograd/__init__.py
index f579cc99b..0f2fd5d47 100644
--- a/mindnlp/core/autograd/__init__.py
+++ b/mindnlp/core/autograd/__init__.py
@@ -1,4 +1,4 @@
"""autograd"""
from .node import Node
-from .function import Function
+from .function import Function, value_and_grad
from .grad_mode import no_grad, enable_grad, inference_mode
diff --git a/mindnlp/core/autograd/function.py b/mindnlp/core/autograd/function.py
index e7e35a925..02a1e9855 100644
--- a/mindnlp/core/autograd/function.py
+++ b/mindnlp/core/autograd/function.py
@@ -1,57 +1,110 @@
"""functional autograd"""
from collections.abc import Generator
-from dataclasses import dataclass
-from typing import Tuple, Any, Optional, Type, Sequence
-import functools
-
-@dataclass(unsafe_hash=True)
-class Context:
- """
- Context class is used by `Function` to store information during the forward pass.
- """
-
- no_grad: bool = False
- saved_values: Tuple[Any, ...] = ()
-
- def save_for_backward(self, *values: Any) -> None:
- "Store the given `values` if they need to be used during backpropagation."
- if self.no_grad:
- return
- self.saved_values = values
-
- @property
- def saved_tensors(self) -> Tuple[Any, ...]:
- return self.saved_values
-
-# Constructors
-class Function:
- @classmethod
- def _backward(cls, ctx: Context, *grad_out):
- return cls.backward(ctx, *grad_out) # type: ignore
-
- @classmethod
- def _forward(cls, ctx: Context, *inps, **kwargs):
- return cls.forward(ctx, *inps, **kwargs) # type: ignore
-
- @classmethod
- def apply(cls, *vals, **kwargs):
- # Create the context.
- ctx = Context(not requires_grad)
- # Call forward with the variables.
- results = cls._forward(ctx, *vals, **kwargs)
- requires_grad = any([x.requires_grad for x in vals])
-
- if requires_grad: # cut useless nodes
- generation = max([x.generation for x in vals])
- ctx.outputs = [weakref.ref(output) for output in outputs]
- back = History(cls, ctx, generation)
- for output in outputs:
- output.set_creator(back)
-
- return outputs if len(outputs) > 1 else outputs[0]
-
- def forward(self, xs):
- raise NotImplementedError()
-
- def backward(self, gys):
- raise NotImplementedError()
+
+import mindspore
+from mindspore.ops.composite import GradOperation
+from mindspore.ops import stop_gradient
+from mindspore.common.api import _pynative_executor
+from mindspore._c_expression import Cell_
+from .grad_mode import no_grad
+
+try:
+ from mindspore import _Function as Function
+except:
+ Function = None
+
+grad_ = GradOperation(False, True, False)
+grad_sens_ = GradOperation(False, True, True)
+grad_input_sens_ = GradOperation(True, True, True)
+
+def value_and_grad(fn, params_or_argnums, has_aux=False, attach_grads=True):
+ use_argnums = False
+ if isinstance(params_or_argnums, Generator):
+ params_or_argnums = tuple(params_or_argnums)
+
+ if params_or_argnums and isinstance(params_or_argnums[0], int):
+ use_argnums = True
+ # positional argnums cannot carry a .grad attribute
+ attach_grads = False
+
+ def fn_aux(*args):
+ outputs = fn(*args)
+ no_grad_outputs = ()
+ for out in outputs[1:]:
+ no_grad_outputs += (stop_gradient(out),)
+ return outputs[0], no_grad_outputs
+
+ if has_aux:
+ fn_ = fn_aux
+ else:
+ fn_ = fn
+
+ def value_and_grad_f(*args, **kwargs):
+ _pynative_executor.set_grad_flag(True)
+ _pynative_executor.new_graph(fn, *args, **kwargs)
+ values = fn_(*args, **kwargs)
+ _pynative_executor.end_graph(fn, values, *args, **kwargs)
+
+ run_args = args
+ if kwargs:
+ run_args = args + tuple(kwargs.values())
+
+ grads = _pynative_executor.grad(fn_, grad_, params_or_argnums, None, *run_args)
+ grads = tuple(mindspore.Tensor(grad) for grad in grads)
+ if attach_grads:
+ for param, grad in zip(params_or_argnums, grads):
+ if param.grad is None:
+ param.grad = grad
+ else:
+ param.grad += grad
+ return values
+ return values, grads
+ return value_and_grad_f
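+
+# Usage sketch (illustrative names): accumulate gradients onto parameters.
+#   grad_fn = value_and_grad(loss_fn, model.trainable_params(), attach_grads=True)
+#   loss = grad_fn(inputs, labels)  # each param.grad then holds d(loss)/d(param)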
+
+def grad(fn, params_or_argnums=None, has_aux=False):
+ value_and_grad_f = value_and_grad(fn, params_or_argnums, has_aux)
+ def grad_f(*args):
+ _, g = value_and_grad_f(*args)
+ return g
+ return grad_f
+
+
+if Function is None:
+ class Function(Cell_):
+ def __init__(self):
+ super().__init__(str(self.__class__)[8:-2])
+ self.saved_tensors = []
+ self.used_bprop_inputs = []
+
+ def save_for_backward(self, *args):
+ self.saved_tensors.extend(args)
+
+ @staticmethod
+ def forward(ctx, *args, **kwargs):
+ raise NotImplementedError
+
+ @staticmethod
+ def backward(ctx, *args, **kwargs):
+ raise NotImplementedError
+
+ def construct(self, *args, **kwargs):
+ self.needs_input_grad = [input_.requires_grad if hasattr(input_, 'requires_grad') else False for input_ in args]
+ args = (self,) + args
+ return self.forward(*args, **kwargs)
+
+ def bprop(self, *args, **kwargs):
+ args = (args[-1],)
+ args = (self,) + args
+ return self.backward(*args, **kwargs)
+
+ def __call__(self, *args, **kwargs):
+ with no_grad():
+ output = self.construct(*args, **kwargs)
+ if _pynative_executor.requires_grad():
+ _pynative_executor.call_custom_bprop(self, output, *args, **kwargs)
+ return output
+
+ @classmethod
+ def apply(cls, *args, **kwargs):
+ return cls()(*args, **kwargs)
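+
+# Usage sketch for the fallback Function (illustrative subclass):
+#   class Square(Function):
+#       @staticmethod
+#       def forward(ctx, x):
+#           ctx.save_for_backward(x)
+#           return x * x
+#       @staticmethod
+#       def backward(ctx, grad_output):
+#           x = ctx.saved_tensors[0]
+#           return 2 * x * grad_output
+#   y = Square.apply(x)  # construct() records the custom bprop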
diff --git a/mindnlp/core/autograd/functions/custom.py b/mindnlp/core/autograd/functions/custom.py
deleted file mode 100644
index 5f53b3bd6..000000000
--- a/mindnlp/core/autograd/functions/custom.py
+++ /dev/null
@@ -1,59 +0,0 @@
-from mindspore._c_expression import TensorPy as MSTensor
-
-from mindnlp import core
-from core._prims.ascend import cast_npu
-from core._prims.cpu import cast_cpu
-from ..node import Node
-
-
-class AccumulateGrad(Node):
- def __init__(self):
- super().__init__('AccumulateGrad')
- self._post_hook = None
-
- def construct(self, input):
- return input
-
- def bprop(self, input, output, grad):
- if input.grad is None:
- input.grad = grad
- else:
- input.grad += grad
-
- if self._post_hook is not None:
- self._post_hook(input)
- return grad
-
- def register_post_hook(self, hook):
- self._post_hook = hook
-
-
-class Cast(Node):
- def __init__(self):
- super().__init__('Cast')
- self.used_bprop_inputs = []
-
- def construct(self, input, dtype, device):
- self.device = input.device
- self.dtype = input.dtype
- if device.type == 'cpu':
- out = cast_cpu(input, dtype).get_value()
- else:
- out = cast_npu(input, dtype).get_value()
-
- output = core.Tensor.__new__(core.Tensor)
- MSTensor.__init__(output, out)
- output.device = device
- return output
-
- def bprop(self, *args):
- grad = args[-1]
- if self.device.type == 'cpu':
- out = cast_cpu(grad, self.dtype).get_value()
- else:
- out = cast_npu(grad, self.dtype).get_value()
-
- output = core.Tensor.__new__(core.Tensor)
- MSTensor.__init__(output, out)
- output.device = self.device
- return output, None, None
diff --git a/mindnlp/core/configs.py b/mindnlp/core/configs.py
index c046d3ec7..6a65f786a 100644
--- a/mindnlp/core/configs.py
+++ b/mindnlp/core/configs.py
@@ -5,6 +5,7 @@
SOC = MSContext.get_instance().get_ascend_soc_version()
DEVICE_TARGET = mindspore.get_context('device_target')
SUPPORT_BF16 = SOC in ["ascend910b", "ascend910_93"]
+ON_A1 = not SUPPORT_BF16
ON_ORANGE_PI = '310b' in SOC
USE_PYBOOST = DEVICE_TARGET == 'Ascend'
DEFAULT_DTYPE = mindspore.float32
diff --git a/mindnlp/core/distributed/fsdp/__init__.py b/mindnlp/core/distributed/fsdp/__init__.py
index fa3888cbd..3b6767333 100644
--- a/mindnlp/core/distributed/fsdp/__init__.py
+++ b/mindnlp/core/distributed/fsdp/__init__.py
@@ -1 +1,2 @@
-FullyShardedDataParallel = None
+class FullyShardedDataParallel:
+ pass
diff --git a/mindnlp/core/distributed/tensor/__init__.py b/mindnlp/core/distributed/tensor/__init__.py
index 82f67afae..9790c03e8 100644
--- a/mindnlp/core/distributed/tensor/__init__.py
+++ b/mindnlp/core/distributed/tensor/__init__.py
@@ -1,4 +1,4 @@
Replicate = None
-DTensor = None
+class DTensor: pass
Placement = None
Shard = None
diff --git a/mindnlp/core/distributions/__init__.py b/mindnlp/core/distributions/__init__.py
index f012e1020..3ca74fcd0 100644
--- a/mindnlp/core/distributions/__init__.py
+++ b/mindnlp/core/distributions/__init__.py
@@ -10,3 +10,4 @@
from .transforms import *
from .relaxed_categorical import *
from .relaxed_bernoulli import *
+from .multivariate_normal import *
\ No newline at end of file
diff --git a/mindnlp/core/distributions/constraints.py b/mindnlp/core/distributions/constraints.py
index 460e83182..ef0311ac8 100644
--- a/mindnlp/core/distributions/constraints.py
+++ b/mindnlp/core/distributions/constraints.py
@@ -33,7 +33,7 @@
"""
import mindspore
-from .. import ops
+from mindnlp import core
__all__ = [
@@ -452,7 +452,7 @@ class _Simplex(Constraint):
event_dim = 1
def check(self, value):
- return ops.all(value >= 0, dim=-1) & ((value.sum(-1) - 1).abs() < 1e-6)
+ return core.all(value >= 0, dim=-1) & ((value.sum(-1) - 1).abs() < 1e-6)
class _Multinomial(Constraint):
@@ -528,7 +528,7 @@ class _Square(Constraint):
event_dim = 2
def check(self, value):
- return ops.full(
+ return core.full(
size=value.shape[:-2],
fill_value=(value.shape[-2] == value.shape[-1]),
dtype=mindspore.bool_,
@@ -544,7 +544,7 @@ def check(self, value):
square_check = super().check(value)
if not square_check.all():
return square_check
- return ops.isclose(value, value.mT, atol=1e-6).all(-2).all(-1)
+ return core.isclose(value, value.mT, atol=1e-6).all(-2).all(-1)
class _PositiveSemidefinite(_Symmetric):
@@ -568,7 +568,7 @@ def check(self, value):
sym_check = super().check(value)
if not sym_check.all():
return sym_check
- return ops.linalg.cholesky_ex(value).info.eq(0)
+ return core.linalg.cholesky_ex(value).info.eq(0)
class _Cat(Constraint):
diff --git a/mindnlp/core/distributions/multivariate_normal.py b/mindnlp/core/distributions/multivariate_normal.py
new file mode 100644
index 000000000..a4f25920e
--- /dev/null
+++ b/mindnlp/core/distributions/multivariate_normal.py
@@ -0,0 +1,269 @@
+# mypy: allow-untyped-defs
+import math
+from typing import Optional
+
+from mindnlp import core
+from mindnlp.core import Tensor
+from mindnlp.core.distributions import constraints
+from mindnlp.core.distributions.distribution import Distribution
+from mindnlp.core.distributions.utils import _standard_normal, lazy_property
+from mindnlp.core.types import _size
+
+
+__all__ = ["MultivariateNormal"]
+
+
+def _batch_mv(bmat, bvec):
+ r"""
+ Performs a batched matrix-vector product, with compatible but different batch shapes.
+
+ This function takes as input `bmat`, containing :math:`n \times n` matrices, and
+ `bvec`, containing length :math:`n` vectors.
+
+ Both `bmat` and `bvec` may have any number of leading dimensions, which correspond
+ to a batch shape. They are not necessarily assumed to have the same batch shape,
+ just ones which can be broadcasted.
+ """
+ return core.matmul(bmat, bvec.unsqueeze(-1)).squeeze(-1)
+
+
+def _batch_mahalanobis(bL, bx):
+ r"""
+ Computes the squared Mahalanobis distance :math:`\mathbf{x}^\top\mathbf{M}^{-1}\mathbf{x}`
+ for a factored :math:`\mathbf{M} = \mathbf{L}\mathbf{L}^\top`.
+
+ Accepts batches for both bL and bx. They are not required to have the same
+ batch shape, but the batch shape of `bL` must broadcast to that of `bx`.
+ """
+ n = bx.size(-1)
+ bx_batch_shape = bx.shape[:-1]
+
+ # Assume that bL.shape = (i, 1, n, n), bx.shape = (..., i, j, n),
+ # we are going to make bx have shape (..., 1, j, i, 1, n) to apply batched tri.solve
+ bx_batch_dims = len(bx_batch_shape)
+ bL_batch_dims = bL.dim() - 2
+ outer_batch_dims = bx_batch_dims - bL_batch_dims
+ old_batch_dims = outer_batch_dims + bL_batch_dims
+ new_batch_dims = outer_batch_dims + 2 * bL_batch_dims
+ # Reshape bx with the shape (..., 1, i, j, 1, n)
+ bx_new_shape = bx.shape[:outer_batch_dims]
+ for sL, sx in zip(bL.shape[:-2], bx.shape[outer_batch_dims:-1]):
+ bx_new_shape += (sx // sL, sL)
+ bx_new_shape += (n,)
+ bx = bx.reshape(bx_new_shape)
+ # Permute bx to make it have shape (..., 1, j, i, 1, n)
+ permute_dims = (
+ list(range(outer_batch_dims))
+ + list(range(outer_batch_dims, new_batch_dims, 2))
+ + list(range(outer_batch_dims + 1, new_batch_dims, 2))
+ + [new_batch_dims]
+ )
+ bx = bx.permute(permute_dims)
+
+ flat_L = bL.reshape(-1, n, n) # shape = b x n x n
+ flat_x = bx.reshape(-1, flat_L.size(0), n) # shape = c x b x n
+ flat_x_swap = flat_x.permute(1, 2, 0) # shape = b x n x c
+ M_swap = (
+ core.linalg.solve_triangular(flat_L, flat_x_swap, upper=False).pow(2).sum(-2)
+ ) # shape = b x c
+ M = M_swap.t() # shape = c x b
+
+ # Now we revert the above reshape and permute operators.
+ permuted_M = M.reshape(bx.shape[:-1]) # shape = (..., 1, j, i, 1)
+ permute_inv_dims = list(range(outer_batch_dims))
+ for i in range(bL_batch_dims):
+ permute_inv_dims += [outer_batch_dims + i, old_batch_dims + i]
+ reshaped_M = permuted_M.permute(permute_inv_dims) # shape = (..., 1, i, j, 1)
+ return reshaped_M.reshape(bx_batch_shape)
+
+
+def _precision_to_scale_tril(P):
+ # Ref: https://nbviewer.jupyter.org/gist/fehiepsi/5ef8e09e61604f10607380467eb82006#Precision-to-scale_tril
+ Lf = core.linalg.cholesky(core.flip(P, (-2, -1)))
+ L_inv = core.transpose(core.flip(Lf, (-2, -1)), -2, -1)
+ Id = core.eye(P.shape[-1], dtype=P.dtype, device=P.device)
+ L = core.linalg.solve_triangular(L_inv, Id, upper=False)
+ return L
+
+
+class MultivariateNormal(Distribution):
+ r"""
+ Creates a multivariate normal (also called Gaussian) distribution
+ parameterized by a mean vector and a covariance matrix.
+
+ The multivariate normal distribution can be parameterized either
+ in terms of a positive definite covariance matrix :math:`\mathbf{\Sigma}`
+ or a positive definite precision matrix :math:`\mathbf{\Sigma}^{-1}`
+ or a lower-triangular matrix :math:`\mathbf{L}` with positive-valued
+ diagonal entries, such that
+ :math:`\mathbf{\Sigma} = \mathbf{L}\mathbf{L}^\top`. This triangular matrix
+ can be obtained via e.g. Cholesky decomposition of the covariance.
+
+ Example:
+
+ >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LAPACK)
+ >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+ >>> m = MultivariateNormal(core.zeros(2), core.eye(2))
+ >>> m.sample() # normally distributed with mean=`[0,0]` and covariance_matrix=`I`
+ tensor([-0.2102, -0.5429])
+
+ Args:
+ loc (Tensor): mean of the distribution
+ covariance_matrix (Tensor): positive-definite covariance matrix
+ precision_matrix (Tensor): positive-definite precision matrix
+ scale_tril (Tensor): lower-triangular factor of covariance, with positive-valued diagonal
+
+ Note:
+ Only one of :attr:`covariance_matrix` or :attr:`precision_matrix` or
+ :attr:`scale_tril` can be specified.
+
+ Using :attr:`scale_tril` will be more efficient: all computations internally
+ are based on :attr:`scale_tril`. If :attr:`covariance_matrix` or
+ :attr:`precision_matrix` is passed instead, it is only used to compute
+ the corresponding lower triangular matrices using a Cholesky decomposition.
+ """
+
+ arg_constraints = {
+ "loc": constraints.real_vector,
+ "covariance_matrix": constraints.positive_definite,
+ "precision_matrix": constraints.positive_definite,
+ "scale_tril": constraints.lower_cholesky,
+ }
+ support = constraints.real_vector
+ has_rsample = True
+
+ def __init__(
+ self,
+ loc: Tensor,
+ covariance_matrix: Optional[Tensor] = None,
+ precision_matrix: Optional[Tensor] = None,
+ scale_tril: Optional[Tensor] = None,
+ validate_args: Optional[bool] = None,
+ ) -> None:
+ if loc.dim() < 1:
+ raise ValueError("loc must be at least one-dimensional.")
+ if (covariance_matrix is not None) + (scale_tril is not None) + (
+ precision_matrix is not None
+ ) != 1:
+ raise ValueError(
+ "Exactly one of covariance_matrix or precision_matrix or scale_tril may be specified."
+ )
+
+ if scale_tril is not None:
+ if scale_tril.dim() < 2:
+ raise ValueError(
+ "scale_tril matrix must be at least two-dimensional, "
+ "with optional leading batch dimensions"
+ )
+ batch_shape = core.broadcast_shapes(scale_tril.shape[:-2], loc.shape[:-1])
+ self.scale_tril = scale_tril.expand(batch_shape + (-1, -1))
+ elif covariance_matrix is not None:
+ if covariance_matrix.dim() < 2:
+ raise ValueError(
+ "covariance_matrix must be at least two-dimensional, "
+ "with optional leading batch dimensions"
+ )
+ batch_shape = core.broadcast_shapes(
+ covariance_matrix.shape[:-2], loc.shape[:-1]
+ )
+ self.covariance_matrix = covariance_matrix.expand(batch_shape + (-1, -1))
+ else:
+ assert precision_matrix is not None # helps mypy
+ if precision_matrix.dim() < 2:
+ raise ValueError(
+ "precision_matrix must be at least two-dimensional, "
+ "with optional leading batch dimensions"
+ )
+ batch_shape = core.broadcast_shapes(
+ precision_matrix.shape[:-2], loc.shape[:-1]
+ )
+ self.precision_matrix = precision_matrix.expand(batch_shape + (-1, -1))
+ self.loc = loc.expand(batch_shape + (-1,))
+
+ event_shape = self.loc.shape[-1:]
+ super().__init__(batch_shape, event_shape, validate_args=validate_args)
+
+ if scale_tril is not None:
+ self._unbroadcasted_scale_tril = scale_tril
+ elif covariance_matrix is not None:
+ self._unbroadcasted_scale_tril = core.linalg.cholesky(covariance_matrix)
+ else: # precision_matrix is not None
+ self._unbroadcasted_scale_tril = _precision_to_scale_tril(precision_matrix)
+
+ def expand(self, batch_shape, _instance=None):
+ new = self._get_checked_instance(MultivariateNormal, _instance)
+ batch_shape = core.Size(batch_shape)
+ loc_shape = batch_shape + self.event_shape
+ cov_shape = batch_shape + self.event_shape + self.event_shape
+ new.loc = self.loc.expand(loc_shape)
+ new._unbroadcasted_scale_tril = self._unbroadcasted_scale_tril
+ if "covariance_matrix" in self.__dict__:
+ new.covariance_matrix = self.covariance_matrix.expand(cov_shape)
+ if "scale_tril" in self.__dict__:
+ new.scale_tril = self.scale_tril.expand(cov_shape)
+ if "precision_matrix" in self.__dict__:
+ new.precision_matrix = self.precision_matrix.expand(cov_shape)
+ super(MultivariateNormal, new).__init__(
+ batch_shape, self.event_shape, validate_args=False
+ )
+ new._validate_args = self._validate_args
+ return new
+
+ @lazy_property
+ def scale_tril(self) -> Tensor:
+ return self._unbroadcasted_scale_tril.expand(
+ self._batch_shape + self._event_shape + self._event_shape
+ )
+
+ @lazy_property
+ def covariance_matrix(self) -> Tensor:
+ return core.matmul(
+ self._unbroadcasted_scale_tril, self._unbroadcasted_scale_tril.mT
+ ).expand(self._batch_shape + self._event_shape + self._event_shape)
+
+ @lazy_property
+ def precision_matrix(self) -> Tensor:
+ return core.cholesky_inverse(self._unbroadcasted_scale_tril).expand(
+ self._batch_shape + self._event_shape + self._event_shape
+ )
+
+ @property
+ def mean(self) -> Tensor:
+ return self.loc
+
+ @property
+ def mode(self) -> Tensor:
+ return self.loc
+
+ @property
+ def variance(self) -> Tensor:
+ return (
+ self._unbroadcasted_scale_tril.pow(2)
+ .sum(-1)
+ .expand(self._batch_shape + self._event_shape)
+ )
+
+ def rsample(self, sample_shape: _size = core.Size()) -> Tensor:
+ shape = self._extended_shape(sample_shape)
+ eps = _standard_normal(shape, dtype=self.loc.dtype, device=self.loc.device)
+ return self.loc + _batch_mv(self._unbroadcasted_scale_tril, eps)
+
+ def log_prob(self, value):
+ if self._validate_args:
+ self._validate_sample(value)
+ diff = value - self.loc
+ M = _batch_mahalanobis(self._unbroadcasted_scale_tril, diff)
+ half_log_det = (
+ self._unbroadcasted_scale_tril.diagonal(dim1=-2, dim2=-1).log().sum(-1)
+ )
+ return -0.5 * (self._event_shape[0] * math.log(2 * math.pi) + M) - half_log_det
+
+ def entropy(self):
+ half_log_det = (
+ self._unbroadcasted_scale_tril.diagonal(dim1=-2, dim2=-1).log().sum(-1)
+ )
+ H = 0.5 * self._event_shape[0] * (1.0 + math.log(2 * math.pi)) + half_log_det
+ if len(self._batch_shape) == 0:
+ return H
+ else:
+ return H.expand(self._batch_shape)
\ No newline at end of file
diff --git a/mindnlp/core/distributions/utils.py b/mindnlp/core/distributions/utils.py
index 596f1d68b..cdb6e3b8d 100644
--- a/mindnlp/core/distributions/utils.py
+++ b/mindnlp/core/distributions/utils.py
@@ -58,7 +58,7 @@ def broadcast_all(*values):
return ops.broadcast_tensors(*values)
-def _standard_normal(shape, dtype):
+def _standard_normal(shape, dtype, device=None):
return ops.normal(size = shape).to(dtype)
diff --git a/mindnlp/core/linalg/__init__.py b/mindnlp/core/linalg/__init__.py
new file mode 100644
index 000000000..25891956c
--- /dev/null
+++ b/mindnlp/core/linalg/__init__.py
@@ -0,0 +1,23 @@
+from collections import namedtuple
+from mindspore import ops
+from mindspore.ops._primitive_cache import _get_cache_prim
+
+from mindnlp import core
+
+linalg_cholesky_ex = namedtuple('linalg_cholesky_ex', ['L', 'info'])
+
+def cholesky(A, *, upper=False, out=None):
+ cholesky_op = _get_cache_prim(ops.Cholesky)(upper=upper).set_device('CPU')
+ return cholesky_op(A)
+
+def cholesky_ex(A, *, upper=False, check_errors=False, out=None):
+ try:
+ out = cholesky(A, upper=upper, out=out)
+ out + 1 # force kernel execution so a failed factorization raises here
+ info = core.Tensor(0)
+ except Exception:
+ info = core.Tensor(1)
+ out = A
+ return linalg_cholesky_ex(out, info)
+
+
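+# Usage sketch, mirroring torch.linalg.cholesky_ex's (L, info) contract:
+#   L, info = cholesky_ex(A)
+#   ok = bool(info == 0)  # nonzero info flags a failed factorization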
diff --git a/mindnlp/core/nn/__init__.py b/mindnlp/core/nn/__init__.py
index c3eecf6d8..0de29ba94 100644
--- a/mindnlp/core/nn/__init__.py
+++ b/mindnlp/core/nn/__init__.py
@@ -16,3 +16,4 @@
from . import utils, functional, init
from .modules import *
from .parameter import Parameter
+from .parallel import DataParallel as DataParallel
\ No newline at end of file
diff --git a/mindnlp/core/nn/functional.py b/mindnlp/core/nn/functional.py
index e7db4975d..241df33ae 100644
--- a/mindnlp/core/nn/functional.py
+++ b/mindnlp/core/nn/functional.py
@@ -10,7 +10,7 @@
from mindnlp import core
from mindnlp.core.executor import execute
-from ..configs import DEVICE_TARGET, ON_ORANGE_PI, use_pyboost
+from ..configs import DEVICE_TARGET, ON_ORANGE_PI, use_pyboost, ON_A1
generator_step_ = 12
@@ -152,15 +152,12 @@ def avg_pool2d(input, kernel_size, stride=None, padding=0, ceil_mode=False, coun
return ops.avg_pool2d(input, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override)
-def dropout(input, p=0.5, training=True, inplace=False):
- if not training:
- return input
- seed, offset = default_generator._step(generator_step_) # pylint: disable=protected-access
- out, _ = execute('dropout_ext', input, p, seed, offset)
- if inplace:
- input.copy_(out)
+def dropout(input, p=0.5, training=True, inplace=False):
+ if not training or p == 0:
 return input
- return out
+ if use_pyboost() and not ON_ORANGE_PI:
+ out = mint.nn.functional.dropout(input, p, training)
+ else:
+ out = ops.dropout(input, p, training)
+ if inplace:
+ input.copy_(out)
+ return input
+ return out
def dropout2d(input, p=0.5, training=False):
return ops.dropout2d(input, p, training)
@@ -211,11 +208,15 @@ def gumbel_softmax(logits: core.Tensor, tau: float = 1, hard: bool = False, eps:
ret = y_soft
return ret
-def log_softmax(input, dim=-1, dtype=None):
- if input.device.type == 'cpu':
- return execute('log_softmax', input, dim)
- return execute('log_softmax_ext', input, dim,
- dtype if dtype is None else dtype_to_type_id('LogSoftmaxExt', 'dtype', dtype))
+def log_softmax(input, dim=None, dtype=None):
+ if use_pyboost():
+ return mint.nn.functional.log_softmax(input, dim=dim, dtype=dtype)
+ if dim is None:
+ dim = -1
+ out = ops.log_softmax(input, dim)
+ if dtype is not None:
+ out = out.to(dtype)
+ return out
def embedding(input, weight, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False):
if use_pyboost():
@@ -259,24 +260,15 @@ def _replication_pad(input, pad):
return out
def pad(input, pad, mode='constant', value=0.0):
- out = input
- if (isinstance(pad, tuple) and not pad):
- return out
- if mode == "constant":
- value = 0 if value is None else value
- out = execute('constant_pad_nd', input, pad, value)
- else:
- if value is not None and value != 0:
- raise ValueError(f"Padding mode {mode} doesn\'t take in value argument.")
- if mode == "circular":
- out = _circular_pad(input, pad)
- elif mode == "reflect":
- out = _reflection_pad(input, pad)
- elif mode == "replicate":
- out = _replication_pad(input, pad)
- else:
- raise ValueError(f"Pad filling mode must be 'constant' 'circular' 'reflect' or 'replicate'.")
- return out
+ if isinstance(pad, (tuple, list)):
+ pad = tuple(p if isinstance(p, int) else p.item() for p in pad)
+ # only skip when every pad width is zero; sum(pad) == 0 would also
+ # match mixed negative/positive pads, which must still be applied
+ if all(p == 0 for p in pad):
+ return input
+ if use_pyboost() and not ON_A1:
+ return mint.nn.functional.pad(input, pad, mode, value)
+ if mode in ['reflect', 'circular']:
+ return ops.pad(input, pad, mode)
+ return ops.pad(input, pad, mode, value)
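+
+# Example: pad widths pair up from the last dim backwards, e.g.
+#   pad(x, (1, 2)) pads the last dim with 1 before and 2 after;
+#   pad(x, (1, 1, 2, 2)) also pads the second-to-last dim by 2 on each side.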
def nll_loss(input, target, weight=None, ignore_index=-100, reduction='mean', label_smoothing=0.0):
return _inner_nll_loss(input, target, weight, ignore_index, reduction, label_smoothing)
@@ -338,7 +330,7 @@ def _nll_loss(inputs, target, target_dim=-1, weight=None, ignore_index=None, red
if target.ndim == inputs.ndim - 1:
target = target.unsqueeze(target_dim)
if ignore_index is not None:
- non_pad_mask = core.equal(target, ignore_index)
+ non_pad_mask = core.eq(target, ignore_index)
target = target.masked_fill(non_pad_mask, 0)
else:
non_pad_mask = target
diff --git a/mindnlp/core/nn/modules/linear.py b/mindnlp/core/nn/modules/linear.py
index 43ee37f9b..23d0e5b7a 100644
--- a/mindnlp/core/nn/modules/linear.py
+++ b/mindnlp/core/nn/modules/linear.py
@@ -29,13 +29,13 @@ class Linear(Module):
Examples::
>>> m = nn.Linear(20, 30)
- >>> input = autograd.Variable(core.randn(128, 20))
+ >>> input = autograd.Variable(torch.randn(128, 20))
>>> output = m(input)
>>> print(output.size())
"""
- def __init__(self, in_features, out_features, bias=True, dtype=None) -> None:
- factory_kwargs = {'dtype': dtype}
+ def __init__(self, in_features, out_features, bias=True, dtype=None, device=None) -> None:
+ factory_kwargs = {'dtype': dtype, 'device': device}
super().__init__()
self.in_features = in_features
self.out_features = out_features
@@ -80,10 +80,10 @@ class Identity(Module):
Examples::
>>> m = nn.Identity(54, unused_argument1=0.1, unused_argument2=False)
- >>> input = core.randn(128, 20)
+ >>> input = torch.randn(128, 20)
>>> output = m(input)
>>> print(output.size())
- core.Size([128, 20])
+ torch.Size([128, 20])
"""
diff --git a/mindnlp/core/nn/modules/sparse.py b/mindnlp/core/nn/modules/sparse.py
index 5712d68a4..88642422d 100644
--- a/mindnlp/core/nn/modules/sparse.py
+++ b/mindnlp/core/nn/modules/sparse.py
@@ -32,7 +32,7 @@ class Embedding(Module):
def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None,
max_norm: Optional[float] = None, norm_type: float = 2., scale_grad_by_freq: bool = False,
sparse: bool = False, _weight: Optional[Tensor] = None, _freeze: bool = False,
- dtype=None) -> None:
+ dtype=None, device=None) -> None:
- factory_kwargs = {'dtype': dtype}
+ factory_kwargs = {'dtype': dtype, 'device': device}
super().__init__()
self.num_embeddings = num_embeddings
@@ -107,10 +107,10 @@ def from_pretrained(cls, embeddings, freeze=True, padding_idx=None,
Examples::
>>> # FloatTensor containing pretrained weights
- >>> weight = core.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]])
+ >>> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]])
>>> embedding = nn.Embedding.from_pretrained(weight)
>>> # Get embeddings for index 1
- >>> input = core.LongTensor([1])
+ >>> input = torch.LongTensor([1])
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
>>> embedding(input)
tensor([[ 4.0000, 5.1000, 6.3000]])
diff --git a/mindnlp/core/nn/parallel/__init__.py b/mindnlp/core/nn/parallel/__init__.py
index e69de29bb..04d8b0c5e 100644
--- a/mindnlp/core/nn/parallel/__init__.py
+++ b/mindnlp/core/nn/parallel/__init__.py
@@ -0,0 +1 @@
+from .distributed import DistributedDataParallel, DataParallel
\ No newline at end of file
diff --git a/mindnlp/core/nn/parallel/distributed.py b/mindnlp/core/nn/parallel/distributed.py
index aefe06666..5961ad131 100644
--- a/mindnlp/core/nn/parallel/distributed.py
+++ b/mindnlp/core/nn/parallel/distributed.py
@@ -1,4 +1,7 @@
from ..modules import Module
class DistributedDataParallel(Module):
- pass
\ No newline at end of file
+ pass
+
+class DataParallel(Module):
+ pass
diff --git a/mindnlp/core/ops/__init__.py b/mindnlp/core/ops/__init__.py
index 5f14d9d90..e4b029cbd 100644
--- a/mindnlp/core/ops/__init__.py
+++ b/mindnlp/core/ops/__init__.py
@@ -1,6 +1,6 @@
"""core ops like torch funcional api"""
from . import array, blas, comparison, pointwise, creation, random, reduction, other, \
- tensor, _inner, optim
+ tensor, _inner, optim, inplace
from .array import *
from .blas import *
from .comparison import *
@@ -14,6 +14,7 @@
# from .spectral import *
from ._inner import *
from .optim import *
+from .inplace import *
def load_library(lib_path):
raise ImportError('not support import any ops for now.')
@@ -34,3 +35,4 @@ def load_library(lib_path):
__all__.extend(tensor.__all__)
__all__.extend(other.__all__)
__all__.extend(optim.__all__)
+__all__.extend(inplace.__all__)
diff --git a/mindnlp/core/ops/array.py b/mindnlp/core/ops/array.py
index ba54bf936..571293abb 100644
--- a/mindnlp/core/ops/array.py
+++ b/mindnlp/core/ops/array.py
@@ -9,6 +9,7 @@
from ..configs import use_pyboost, ON_ORANGE_PI
from .other import broadcast_tensors
from ._inner import call_ms_func
+from .creation import arange
# adjoint
@@ -20,7 +21,10 @@ def argwhere(input):
# cat
has_cat = hasattr(mindspore.mint, 'cat')
-def cat(tensors, dim=0, *, out=None):
+def cat(tensors, dim=0, *, out=None, **kwargs):
+ axis = kwargs.get('axis', None)
+ if axis is not None:
+ dim = axis
if use_pyboost() and has_cat:
return call_ms_func(mindspore.mint.cat, tensors, dim, out=out)
return call_ms_func(ops.cat, tensors, dim, out=out)
@@ -223,7 +227,7 @@ def split(tensor, split_size_or_sections, dim=0):
# squeeze
has_squeeze = hasattr(mindspore.mint, 'squeeze')
-def squeeze(input, dim=None):
+def squeeze(input, *dim):
+ # accept squeeze(x), squeeze(x, d) and squeeze(x, d0, d1, ...); None squeezes all
+ dim = dim[0] if len(dim) == 1 else (dim if dim else None)
if use_pyboost() and has_squeeze:
return mindspore.mint.squeeze(input, dim)
return ops.squeeze(input, dim)
@@ -253,15 +257,58 @@ def take(input, index):
return tf_gather(input, index, 0).view(index_shape)
return gather(input, 0, index).view(index_shape)
-# take_along_dim
+def infer_size_impl(a, b):
+ lenA = len(a)
+ lenB = len(b)
+ ndim = max(lenA, lenB)
+ expanded_sizes = [0] * ndim
+
+ for i in range(ndim - 1, -1, -1):
+ offset = ndim - 1 - i
+ dimA = lenA - 1 - offset
+ dimB = lenB - 1 - offset
+
+ sizeA = a[dimA] if dimA >= 0 else 1
+ sizeB = b[dimB] if dimB >= 0 else 1
+
+ # check dimension compatibility
+ if not (sizeA == sizeB or sizeA == 1 or sizeB == 1):
+ raise RuntimeError(
+ f"The size of tensor a ({sizeA}) must match the size of tensor b ({sizeB}) "
+ f"at non-singleton dimension {i}"
+ )
+
+ # apply the broadcasting rule: prefer the non-1 dimension size
+ expanded_sizes[i] = sizeB if sizeA == 1 else sizeA
+
+ return expanded_sizes
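+
+# Example: infer_size_impl([3, 1, 5], [4, 5]) -> [3, 4, 5]
+# (trailing dims are aligned; size-1 dims broadcast, as in NumPy/PyTorch).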
+
+def _take_along_dim_helper(self, indices, dim):
+ assert self.dim() == indices.dim(), f"torch.take_along_dim(): input and indices should have the same number of dimensions, " \
+ f"but got {self.dim()} dimensions for input, and {indices.dim()} dimensions for indices"
+ dim = self.dim() + dim if dim < 0 else dim
+ self_sizes = list(self.shape)
+ self_sizes[dim] = indices.size(dim)
+ broadcast_shape = infer_size_impl(self_sizes, indices.shape)
+ indices_broadcasted = indices.broadcast_to(broadcast_shape)
+
+ indices_sizes = list(indices.shape)
+ indices_sizes[dim] = self.size(dim)
+ broadcast_shape = infer_size_impl(indices_sizes, self.shape)
+ self_broadcasted = self.broadcast_to(broadcast_shape)
+
+ return self_broadcasted, indices_broadcasted, dim
+
+# take_along_dim
+def take_along_dim(input, indices, dim=None, *, out=None):
+ if dim is not None: # a truthiness check would wrongly flatten for dim == 0
+ self_broadcasted, indices_broadcasted, dim = _take_along_dim_helper(input, indices, dim)
+ return self_broadcasted.gather(dim, indices_broadcasted)
+ return input.view(-1).gather(0, indices.view(-1))
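+
+# Usage sketch, assuming core.argsort mirrors torch.argsort:
+#   idx = core.argsort(x, dim=1)
+#   x_sorted = take_along_dim(x, idx, dim=1)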
# tensor_split
def tensor_split(input, indices_or_sections, dim=0):
- if isinstance(indices_or_sections, mindspore.Tensor):
- indices_or_sections = indices_or_sections.tolist()
- else:
- indices_or_sections = tuple([get_item(t) for t in indices_or_sections])
return ops.tensor_split(input, indices_or_sections, dim)
# tile
@@ -710,7 +757,7 @@ def strided_slice_update(input, begin, end, strides, update, begin_mask=0, end_m
'swapaxes',
'swapdims',
'take',
- # take_along_dim
+ 'take_along_dim',
'tensor_split',
'tile',
'transpose',
diff --git a/mindnlp/core/ops/creation.py b/mindnlp/core/ops/creation.py
index 657c3e84f..5bb550a32 100644
--- a/mindnlp/core/ops/creation.py
+++ b/mindnlp/core/ops/creation.py
@@ -87,13 +87,24 @@ def ones_like(input, *, dtype=None, device=None):
return ops.ones_like(input, dtype=dtype)
# arange
+range_op = ops.Range()
has_arange = hasattr(mindspore.mint, 'arange')
def arange(start=0, end=None, step=1, *, dtype=None, device=None):
if ON_ORANGE_PI and dtype in (None, mindspore.int64):
dtype = mindspore.int32
if use_pyboost() and has_arange:
+ start = start.item() if isinstance(start, mindspore.Tensor) else start
+ end = end.item() if isinstance(end, mindspore.Tensor) else end
+ step = step.item() if isinstance(step, mindspore.Tensor) else step
return mindspore.mint.arange(start, end, step, dtype=dtype)
- return ops.arange(start, end, step, dtype=dtype)
+
+ start = mindspore.Tensor(start) if not isinstance(start, mindspore.Tensor) else start
+ end = mindspore.Tensor(end) if not isinstance(end, mindspore.Tensor) else end
+ step = mindspore.Tensor(step) if not isinstance(step, mindspore.Tensor) else step
+ out = range_op(start, end, step)
+ if dtype:
+ out = out.to(dtype)
+ return out
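+
+# Example: arange(0, 5, 2) -> tensor([0, 2, 4]) on either code path.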
# range
def range(start=0, end=None, step=1, dtype=None):
diff --git a/mindnlp/core/ops/inplace.py b/mindnlp/core/ops/inplace.py
index 107a82dce..45655f62f 100644
--- a/mindnlp/core/ops/inplace.py
+++ b/mindnlp/core/ops/inplace.py
@@ -1,6 +1,7 @@
-
from mindspore.common.generator import default_generator
+from mindspore.ops.auto_generate.gen_ops_prim import inplace_normal_op
+from mindnlp import core
generator_step_ = 12
@@ -39,10 +40,11 @@ def inplace_normal(input, mean=0, std=1, *, generator=None):
if generator is None:
generator = default_generator
seed, offset = generator._step(generator_step_)
- if input.device.type == 'npu':
- execute('inplace_normal', input, mean, std, seed, offset)
- elif input.device.type == 'cpu':
- core.normal(mean, std, size=input.size, generator=generator, out=input)
+ if isinstance(mean, core.Tensor):
+ mean = mean.item()
+ if isinstance(std, core.Tensor):
+ std = std.item()
+ inplace_normal_op(input, mean, std, seed, offset)
return input
diff --git a/mindnlp/core/ops/other.py b/mindnlp/core/ops/other.py
index d786a96cd..e57faaba5 100644
--- a/mindnlp/core/ops/other.py
+++ b/mindnlp/core/ops/other.py
@@ -7,7 +7,7 @@
from mindspore.common.initializer import initializer
from mindspore.ops._primitive_cache import _get_cache_prim
-from ..configs import use_pyboost, ON_ORANGE_PI
+from ..configs import use_pyboost, ON_ORANGE_PI, ON_A1
from .reduction import any
from .comparison import eq
from ._inner import call_ms_func
@@ -710,19 +710,20 @@ def meshgrid(*tensors, indexing=None):
# repeat_interleave
-has_repeat_interleave = hasattr(mindspore.mint, "repeat_interleave")
-
-
-def repeat_interleave(*args, **kwargs):
- if use_pyboost() and has_repeat_interleave:
- return mindspore.mint.repeat_interleave(*args, **kwargs)
-
- input, repeats, dim = args.get("input"), args.get("repeats"), args.get("dim")
+has_repeat_interleave = hasattr(mindspore.mint, 'repeat_interleave')
+def repeat_interleave(input, repeats, dim=None):
+ if use_pyboost() and has_repeat_interleave and not ON_A1:
+ return mindspore.mint.repeat_interleave(input, repeats, dim=dim)
- if input.dtype == mindspore.bool_:
- input = input.int()
- return input.repeat(repeats, dim).bool()
- return input.repeat(repeats, dim)
-
+ is_bool = input.dtype == mindspore.bool_
+ if is_bool:
+ input = input.int()
+ if dim is None:
+ # torch flattens the input when dim is not given
+ input = input.reshape(-1)
+ dim = 0
+ if dim < 0:
+ dim += input.ndim
+ new_shape = list(input.shape)
+ new_shape.insert(dim+1, repeats)
+ expanded = input.unsqueeze(dim+1).expand(new_shape)
+
+ final_shape = new_shape.copy()
+ final_shape[dim] *= final_shape[dim+1]
+ del final_shape[dim+1]
+ out = expanded.reshape(final_shape)
+ return out.bool() if is_bool else out
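+
+# Example: repeat_interleave(tensor([[1, 2]]), 2, dim=1) -> [[1, 1, 2, 2]],
+# matching torch.repeat_interleave's element-wise repetition semantics.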
# roll
DEVICE_TARGET = mindspore.get_context("device_target")
diff --git a/mindnlp/core/ops/random.py b/mindnlp/core/ops/random.py
index f66f58f30..4aed3df98 100644
--- a/mindnlp/core/ops/random.py
+++ b/mindnlp/core/ops/random.py
@@ -3,7 +3,7 @@
import mindspore
from mindspore import ops
from mindspore.ops._primitive_cache import _get_cache_prim
-from ..configs import use_pyboost, DEVICE_TARGET
+from ..configs import use_pyboost, DEVICE_TARGET, ON_A1
from .other import cumsum, searchsorted
from .comparison import topk
from .pointwise import div, log
@@ -27,7 +27,7 @@ def bernoulli(input, *, generator=None, out=None):
has_multinomial = hasattr(mindspore.mint, 'multinomial')
def multinomial(input, num_samples, replacement=False, *, generator=None):
"""custom multinomial"""
- if use_pyboost() and has_multinomial:
+ if use_pyboost() and has_multinomial and not ON_A1:
return mindspore.mint.multinomial(input, num_samples, replacement=replacement, generator=generator)
if replacement:
# with replacement
diff --git a/mindnlp/core/ops/reduction.py b/mindnlp/core/ops/reduction.py
index 1789c991a..1da14ab25 100644
--- a/mindnlp/core/ops/reduction.py
+++ b/mindnlp/core/ops/reduction.py
@@ -42,7 +42,14 @@ def aminmax(input, *, dim=None, keepdim=False):
# all
has_all = hasattr(mindspore.mint, 'all')
-def all(input, dim=None, keepdim=False, *, dtype=None):
+def all(input, dim=None, keepdim=False, *, dtype=None, **kwargs):
+ axis = kwargs.get('axis', None)
+ keepdims = kwargs.get('keepdims', None)
+ if axis is not None:
+ dim = axis
+ if keepdims is not None:
+ keepdim = keepdims
+
if use_pyboost() and has_all:
return mindspore.mint.all(input, dim, keepdim).to(input.dtype)
return ops.all(input, dim, keepdim).to(input.dtype)
@@ -79,7 +86,10 @@ def logsumexp(input, dim, keepdim=False):
# mean
has_mean = hasattr(mindspore.mint, 'mean')
-def mean(input, dim=None, keepdim=False, *, dtype=None):
+def mean(input, dim=None, keepdim=False, *, dtype=None, **kwargs):
+ axis = kwargs.get('axis', None)
+ if axis is not None:
+ dim = axis
if use_pyboost() and has_mean:
return mindspore.mint.mean(input, dim, keepdim, dtype=dtype)
out = ops.mean(input, dim, keepdim)
@@ -135,7 +145,10 @@ def nanquantile(input, q, dim=None, keepdim=False, *, interpolation='linear'):
# std
has_std = hasattr(mindspore.mint, 'std')
-def std(input, dim=None, *, correction=1, keepdim=False):
+def std(input, dim=None, *, correction=1, keepdim=False, **kwargs):
+ axis = kwargs.get('axis', None)
+ if axis is not None:
+ dim = axis
if use_pyboost() and has_std:
return mindspore.mint.std(input, dim=dim, correction=correction, keepdim=keepdim)
if DEVICE_TARGET == 'GPU':
diff --git a/mindnlp/core/serialization.py b/mindnlp/core/serialization.py
index ac9dce42d..049a81e42 100644
--- a/mindnlp/core/serialization.py
+++ b/mindnlp/core/serialization.py
@@ -1235,11 +1235,11 @@ class UnpicklerWrapper(pickle_module.Unpickler): # type: ignore[name-defined]
# Lets us override the imports that pickle uses when unpickling an object.
# This is useful for maintaining BC if we change a module path that tensor instantiation relies on.
def find_class(self, mod_name, name):
- if mod_name == 'core._utils':
+ if mod_name == 'torch._utils':
return eval(name)
if mod_name == 'torch':
return str(name)
- if mod_name == 'core._tensor':
+ if mod_name == 'torch._tensor':
return eval(name)
mod_name = load_module_mapping.get(mod_name, mod_name)
return super().find_class(mod_name, name)
diff --git a/mindnlp/core/storage.py b/mindnlp/core/storage.py
index 47f6a5770..45cb25527 100644
--- a/mindnlp/core/storage.py
+++ b/mindnlp/core/storage.py
@@ -12,12 +12,12 @@
from typing_extensions import Self
from mindnlp import core
-from core._utils import _to, _type
-from core.types import _bool, _int, Storage
+from ._utils import _to, _type
+from .types import _bool, _int, Storage
if TYPE_CHECKING:
- from core._prims_common import DeviceLikeType
+ from ._prims_common import DeviceLikeType
__all__ = ["TypedStorage", "UntypedStorage"]
@@ -525,7 +525,10 @@ def _share_filename_cpu_(self, *args, **kwargs):
return super()._share_filename_cpu_(*args, **kwargs)
def data_ptr(self):
- return self.data.ctypes.data
+ if isinstance(self.data, np.ndarray):
+ return self.data.ctypes.data
+ else:
+ return self.data.data_ptr()
def nbytes(self):
return self.data.nbytes
diff --git a/mindnlp/dataset/load.py b/mindnlp/dataset/load.py
index 6a8c9424d..6ae103cc1 100644
--- a/mindnlp/dataset/load.py
+++ b/mindnlp/dataset/load.py
@@ -16,13 +16,11 @@
"""
load
"""
-import os
from typing import Union, Optional, Dict, Sequence, Mapping
from datasets import load_dataset as hf_load
from datasets import Dataset, IterableDataset, Split, Features, \
DownloadConfig, DownloadMode, VerificationMode, Version
from mindspore.dataset import GeneratorDataset
-from mindspore.communication import get_rank, get_group_size
class TransferIterableDataset():
diff --git a/mindnlp/core/_dynamo/__init__.py b/mindnlp/diffusers/__init__.py
similarity index 100%
rename from mindnlp/core/_dynamo/__init__.py
rename to mindnlp/diffusers/__init__.py
diff --git a/mindnlp/engine/trainer/base.py b/mindnlp/engine/trainer/base.py
index bb76fadab..07a8d16a7 100644
--- a/mindnlp/engine/trainer/base.py
+++ b/mindnlp/engine/trainer/base.py
@@ -41,13 +41,12 @@
from ...core.autograd import value_and_grad
from ...core.serialization import safe_load_file, safe_save_file, save, save_checkpoint, load, load_checkpoint
from ...peft import PeftModel
-from ...configs import WEIGHTS_NAME, CONFIG_NAME, ADAPTER_WEIGHTS_NAME, ADAPTER_SAFE_WEIGHTS_NAME, \
- WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME
from ...dataset import BaseMapFunction
from ...utils import logging, find_labels, can_return_loss
from ...accelerate.utils import DistributedType
-from ...accelerate.utils import accelerate_distributed_type
from ...utils.import_utils import is_safetensors_available
+from ...transformers.utils import WEIGHTS_NAME, CONFIG_NAME, WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME, \
+ ADAPTER_WEIGHTS_NAME, ADAPTER_SAFE_WEIGHTS_NAME
from ...transformers.modeling_utils import PreTrainedModel
from ...transformers.configuration_utils import PretrainedConfig
from ...transformers.tokenization_utils_base import PreTrainedTokenizerBase
@@ -285,7 +284,6 @@ def __init__(
# Internal variables to help with automatic batch size reduction
self._train_batch_size = args.train_batch_size
self._created_lr_scheduler = False
- self.actual_distributed_type = accelerate_distributed_type
def _get_learning_rate(self):
@@ -1377,20 +1375,6 @@ def _prepare_inputs(self, inputs: Dict[str, Union[mindspore.Tensor, Any]]) -> Di
return inputs
-
- def update_gradient_by_distributed_type(self, model: nn.Module) -> None:
- """update gradient by distributed_type"""
- if accelerate_distributed_type == DistributedType.NO:
- return
- if accelerate_distributed_type == DistributedType.MULTI_NPU:
- from mindspore.communication import get_group_size
- from mindspore.communication.comm_func import all_reduce
- rank_size = get_group_size()
- for parameter in model.parameters():
- new_grads_mean = all_reduce(parameter.grad) / rank_size
- parameter.grad = new_grads_mean
-
-
def training_step(self, model: nn.Module, inputs: Dict[str, Union[mindspore.Tensor, Any]]) -> Tuple[List[mindspore.Tensor], mindspore.Tensor]:
"""
Perform a training step on a batch of inputs.
@@ -1422,7 +1406,6 @@ def forward(inputs):
self.grad_fn = value_and_grad(forward, weights, attach_grads=True)
loss = self.grad_fn(inputs)
- self.update_gradient_by_distributed_type(model)
return loss / self.args.gradient_accumulation_steps
def compute_loss(self, model, inputs, return_outputs=False):
diff --git a/mindnlp/engine/utils.py b/mindnlp/engine/utils.py
index 0ed550cb8..45e36c078 100644
--- a/mindnlp/engine/utils.py
+++ b/mindnlp/engine/utils.py
@@ -29,8 +29,7 @@
import mindspore
from mindnlp.core import ops
-from core.nn import functional as F
-from mindnlp.configs import GENERATOR_SEED
+from mindnlp.core.nn import functional as F
from mindnlp.utils import is_mindspore_available, ExplicitEnum
@@ -215,8 +214,7 @@ def set_seed(seed: int):
np.random.seed(seed)
if is_mindspore_available():
mindspore.set_seed(seed)
- if GENERATOR_SEED:
- mindspore.manual_seed(seed)
+ mindspore.manual_seed(seed)
def enable_full_determinism(seed: int, warn_only: bool = False):
"""
diff --git a/mindnlp/sentence/__init__.py b/mindnlp/sentence/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/mindnlp/timm/__init__.py b/mindnlp/timm/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/mindnlp/transformers/__init__.py b/mindnlp/transformers/__init__.py
index e02e58ecb..a5b1c4060 100644
--- a/mindnlp/transformers/__init__.py
+++ b/mindnlp/transformers/__init__.py
@@ -3,7 +3,6 @@
from transformers.utils import OptionalDependencyNotAvailable, _LazyModule
from transformers.utils.import_utils import *
-
# Base objects, independent of any specific backend
_import_structure = {
# "agents": [
@@ -4102,342 +4101,7 @@
_import_structure["models.musicgen_melody"].append("MusicgenMelodyProcessor")
-# FLAX-backed objects
-try:
- if not is_flax_available():
- raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
- from transformers.utils import dummy_flax_objects
-
- _import_structure["utils.dummy_flax_objects"] = [
- name for name in dir(dummy_flax_objects) if not name.startswith("_")
- ]
-else:
- _import_structure["generation"].extend(
- [
- "FlaxForcedBOSTokenLogitsProcessor",
- "FlaxForcedEOSTokenLogitsProcessor",
- "FlaxForceTokensLogitsProcessor",
- "FlaxGenerationMixin",
- "FlaxLogitsProcessor",
- "FlaxLogitsProcessorList",
- "FlaxLogitsWarper",
- "FlaxMinLengthLogitsProcessor",
- "FlaxTemperatureLogitsWarper",
- "FlaxSuppressTokensAtBeginLogitsProcessor",
- "FlaxSuppressTokensLogitsProcessor",
- "FlaxTopKLogitsWarper",
- "FlaxTopPLogitsWarper",
- "FlaxWhisperTimeStampLogitsProcessor",
- ]
- )
- _import_structure["modeling_flax_outputs"] = []
- _import_structure["modeling_flax_utils"] = ["FlaxPreTrainedModel"]
- _import_structure["models.albert"].extend(
- [
- "FlaxAlbertForMaskedLM",
- "FlaxAlbertForMultipleChoice",
- "FlaxAlbertForPreTraining",
- "FlaxAlbertForQuestionAnswering",
- "FlaxAlbertForSequenceClassification",
- "FlaxAlbertForTokenClassification",
- "FlaxAlbertModel",
- "FlaxAlbertPreTrainedModel",
- ]
- )
- _import_structure["models.auto"].extend(
- [
- "FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING",
- "FLAX_MODEL_FOR_CAUSAL_LM_MAPPING",
- "FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
- "FLAX_MODEL_FOR_MASKED_LM_MAPPING",
- "FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING",
- "FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING",
- "FLAX_MODEL_FOR_PRETRAINING_MAPPING",
- "FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
- "FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
- "FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
- "FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
- "FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
- "FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING",
- "FLAX_MODEL_MAPPING",
- "FlaxAutoModel",
- "FlaxAutoModelForCausalLM",
- "FlaxAutoModelForImageClassification",
- "FlaxAutoModelForMaskedLM",
- "FlaxAutoModelForMultipleChoice",
- "FlaxAutoModelForNextSentencePrediction",
- "FlaxAutoModelForPreTraining",
- "FlaxAutoModelForQuestionAnswering",
- "FlaxAutoModelForSeq2SeqLM",
- "FlaxAutoModelForSequenceClassification",
- "FlaxAutoModelForSpeechSeq2Seq",
- "FlaxAutoModelForTokenClassification",
- "FlaxAutoModelForVision2Seq",
- ]
- )
-
- # Flax models structure
-
- _import_structure["models.bart"].extend(
- [
- "FlaxBartDecoderPreTrainedModel",
- "FlaxBartForCausalLM",
- "FlaxBartForConditionalGeneration",
- "FlaxBartForQuestionAnswering",
- "FlaxBartForSequenceClassification",
- "FlaxBartModel",
- "FlaxBartPreTrainedModel",
- ]
- )
- _import_structure["models.beit"].extend(
- [
- "FlaxBeitForImageClassification",
- "FlaxBeitForMaskedImageModeling",
- "FlaxBeitModel",
- "FlaxBeitPreTrainedModel",
- ]
- )
-
- _import_structure["models.bert"].extend(
- [
- "FlaxBertForCausalLM",
- "FlaxBertForMaskedLM",
- "FlaxBertForMultipleChoice",
- "FlaxBertForNextSentencePrediction",
- "FlaxBertForPreTraining",
- "FlaxBertForQuestionAnswering",
- "FlaxBertForSequenceClassification",
- "FlaxBertForTokenClassification",
- "FlaxBertModel",
- "FlaxBertPreTrainedModel",
- ]
- )
- _import_structure["models.big_bird"].extend(
- [
- "FlaxBigBirdForCausalLM",
- "FlaxBigBirdForMaskedLM",
- "FlaxBigBirdForMultipleChoice",
- "FlaxBigBirdForPreTraining",
- "FlaxBigBirdForQuestionAnswering",
- "FlaxBigBirdForSequenceClassification",
- "FlaxBigBirdForTokenClassification",
- "FlaxBigBirdModel",
- "FlaxBigBirdPreTrainedModel",
- ]
- )
- _import_structure["models.blenderbot"].extend(
- [
- "FlaxBlenderbotForConditionalGeneration",
- "FlaxBlenderbotModel",
- "FlaxBlenderbotPreTrainedModel",
- ]
- )
- _import_structure["models.blenderbot_small"].extend(
- [
- "FlaxBlenderbotSmallForConditionalGeneration",
- "FlaxBlenderbotSmallModel",
- "FlaxBlenderbotSmallPreTrainedModel",
- ]
- )
- _import_structure["models.bloom"].extend(
- [
- "FlaxBloomForCausalLM",
- "FlaxBloomModel",
- "FlaxBloomPreTrainedModel",
- ]
- )
- _import_structure["models.clip"].extend(
- [
- "FlaxCLIPModel",
- "FlaxCLIPPreTrainedModel",
- "FlaxCLIPTextModel",
- "FlaxCLIPTextPreTrainedModel",
- "FlaxCLIPTextModelWithProjection",
- "FlaxCLIPVisionModel",
- "FlaxCLIPVisionPreTrainedModel",
- ]
- )
- _import_structure["models.dinov2"].extend(
- [
- "FlaxDinov2Model",
- "FlaxDinov2ForImageClassification",
- "FlaxDinov2PreTrainedModel",
- ]
- )
- _import_structure["models.distilbert"].extend(
- [
- "FlaxDistilBertForMaskedLM",
- "FlaxDistilBertForMultipleChoice",
- "FlaxDistilBertForQuestionAnswering",
- "FlaxDistilBertForSequenceClassification",
- "FlaxDistilBertForTokenClassification",
- "FlaxDistilBertModel",
- "FlaxDistilBertPreTrainedModel",
- ]
- )
- _import_structure["models.electra"].extend(
- [
- "FlaxElectraForCausalLM",
- "FlaxElectraForMaskedLM",
- "FlaxElectraForMultipleChoice",
- "FlaxElectraForPreTraining",
- "FlaxElectraForQuestionAnswering",
- "FlaxElectraForSequenceClassification",
- "FlaxElectraForTokenClassification",
- "FlaxElectraModel",
- "FlaxElectraPreTrainedModel",
- ]
- )
- _import_structure["models.encoder_decoder"].append("FlaxEncoderDecoderModel")
- _import_structure["models.gpt2"].extend(["FlaxGPT2LMHeadModel", "FlaxGPT2Model", "FlaxGPT2PreTrainedModel"])
- _import_structure["models.gpt_neo"].extend(
- ["FlaxGPTNeoForCausalLM", "FlaxGPTNeoModel", "FlaxGPTNeoPreTrainedModel"]
- )
- _import_structure["models.gptj"].extend(["FlaxGPTJForCausalLM", "FlaxGPTJModel", "FlaxGPTJPreTrainedModel"])
- _import_structure["models.llama"].extend(["FlaxLlamaForCausalLM", "FlaxLlamaModel", "FlaxLlamaPreTrainedModel"])
- _import_structure["models.gemma"].extend(["FlaxGemmaForCausalLM", "FlaxGemmaModel", "FlaxGemmaPreTrainedModel"])
- _import_structure["models.longt5"].extend(
- [
- "FlaxLongT5ForConditionalGeneration",
- "FlaxLongT5Model",
- "FlaxLongT5PreTrainedModel",
- ]
- )
- _import_structure["models.marian"].extend(
- [
- "FlaxMarianModel",
- "FlaxMarianMTModel",
- "FlaxMarianPreTrainedModel",
- ]
- )
- _import_structure["models.mbart"].extend(
- [
- "FlaxMBartForConditionalGeneration",
- "FlaxMBartForQuestionAnswering",
- "FlaxMBartForSequenceClassification",
- "FlaxMBartModel",
- "FlaxMBartPreTrainedModel",
- ]
- )
- _import_structure["models.mistral"].extend(
- [
- "FlaxMistralForCausalLM",
- "FlaxMistralModel",
- "FlaxMistralPreTrainedModel",
- ]
- )
- _import_structure["models.mt5"].extend(["FlaxMT5EncoderModel", "FlaxMT5ForConditionalGeneration", "FlaxMT5Model"])
- _import_structure["models.opt"].extend(
- [
- "FlaxOPTForCausalLM",
- "FlaxOPTModel",
- "FlaxOPTPreTrainedModel",
- ]
- )
- _import_structure["models.pegasus"].extend(
- [
- "FlaxPegasusForConditionalGeneration",
- "FlaxPegasusModel",
- "FlaxPegasusPreTrainedModel",
- ]
- )
- _import_structure["models.regnet"].extend(
- [
- "FlaxRegNetForImageClassification",
- "FlaxRegNetModel",
- "FlaxRegNetPreTrainedModel",
- ]
- )
- _import_structure["models.resnet"].extend(
- [
- "FlaxResNetForImageClassification",
- "FlaxResNetModel",
- "FlaxResNetPreTrainedModel",
- ]
- )
- _import_structure["models.roberta"].extend(
- [
- "FlaxRobertaForCausalLM",
- "FlaxRobertaForMaskedLM",
- "FlaxRobertaForMultipleChoice",
- "FlaxRobertaForQuestionAnswering",
- "FlaxRobertaForSequenceClassification",
- "FlaxRobertaForTokenClassification",
- "FlaxRobertaModel",
- "FlaxRobertaPreTrainedModel",
- ]
- )
- _import_structure["models.roberta_prelayernorm"].extend(
- [
- "FlaxRobertaPreLayerNormForCausalLM",
- "FlaxRobertaPreLayerNormForMaskedLM",
- "FlaxRobertaPreLayerNormForMultipleChoice",
- "FlaxRobertaPreLayerNormForQuestionAnswering",
- "FlaxRobertaPreLayerNormForSequenceClassification",
- "FlaxRobertaPreLayerNormForTokenClassification",
- "FlaxRobertaPreLayerNormModel",
- "FlaxRobertaPreLayerNormPreTrainedModel",
- ]
- )
- _import_structure["models.roformer"].extend(
- [
- "FlaxRoFormerForMaskedLM",
- "FlaxRoFormerForMultipleChoice",
- "FlaxRoFormerForQuestionAnswering",
- "FlaxRoFormerForSequenceClassification",
- "FlaxRoFormerForTokenClassification",
- "FlaxRoFormerModel",
- "FlaxRoFormerPreTrainedModel",
- ]
- )
- _import_structure["models.speech_encoder_decoder"].append("FlaxSpeechEncoderDecoderModel")
- _import_structure["models.t5"].extend(
- [
- "FlaxT5EncoderModel",
- "FlaxT5ForConditionalGeneration",
- "FlaxT5Model",
- "FlaxT5PreTrainedModel",
- ]
- )
- _import_structure["models.vision_encoder_decoder"].append("FlaxVisionEncoderDecoderModel")
- _import_structure["models.vision_text_dual_encoder"].extend(["FlaxVisionTextDualEncoderModel"])
- _import_structure["models.vit"].extend(["FlaxViTForImageClassification", "FlaxViTModel", "FlaxViTPreTrainedModel"])
- _import_structure["models.wav2vec2"].extend(
- [
- "FlaxWav2Vec2ForCTC",
- "FlaxWav2Vec2ForPreTraining",
- "FlaxWav2Vec2Model",
- "FlaxWav2Vec2PreTrainedModel",
- ]
- )
- _import_structure["models.whisper"].extend(
- [
- "FlaxWhisperForConditionalGeneration",
- "FlaxWhisperModel",
- "FlaxWhisperPreTrainedModel",
- "FlaxWhisperForAudioClassification",
- ]
- )
- _import_structure["models.xglm"].extend(
- [
- "FlaxXGLMForCausalLM",
- "FlaxXGLMModel",
- "FlaxXGLMPreTrainedModel",
- ]
- )
- _import_structure["models.xlm_roberta"].extend(
- [
- "FlaxXLMRobertaForMaskedLM",
- "FlaxXLMRobertaForMultipleChoice",
- "FlaxXLMRobertaForQuestionAnswering",
- "FlaxXLMRobertaForSequenceClassification",
- "FlaxXLMRobertaForTokenClassification",
- "FlaxXLMRobertaModel",
- "FlaxXLMRobertaForCausalLM",
- "FlaxXLMRobertaPreTrainedModel",
- ]
- )
+from . import ms_utils
sys.modules[__name__] = _LazyModule(
'transformers',
@@ -4446,3 +4110,4 @@
module_spec=__spec__,
extra_objects={"__version__": transformers.__version__},
)
+
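With the Flax registration block removed, `mindnlp.transformers` serves only MindSpore-backed objects through the `_LazyModule` shim. A minimal sketch of the lazy resolution from user code (assuming mindnlp is installed; which names resolve lazily depends on the full `_import_structure`, not shown here):

```python
# Sketch of the _LazyModule behavior; assumes mindnlp is installed.
import mindnlp.transformers as tfm

print(tfm.__version__)  # served from extra_objects, no submodule import needed
# Names registered in _import_structure are imported only on first attribute
# access, so start-up pays only for the pieces a program actually touches.
```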
diff --git a/mindnlp/transformers/ms_utils.py b/mindnlp/transformers/ms_utils.py
new file mode 100644
index 000000000..592c366c1
--- /dev/null
+++ b/mindnlp/transformers/ms_utils.py
@@ -0,0 +1,258 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""MindNLP MindSpore Utils"""
+
+import inspect
+from typing import Union, Optional, List, Tuple
+
+import mindspore
+from mindspore.common.initializer import initializer, Normal
+
+from mindnlp.core import nn, ops
+from mindnlp.core.nn import Parameter
+
+ALL_LAYERNORM_LAYERS = [nn.LayerNorm]
+
+class Conv1D(nn.Module):
+ """
+ 1D-convolutional layer. Basically works like a linear layer, but the weights are transposed.
+
+ Args:
+ n_out (`int`): The number of output features.
+ n_in (`int`): The number of input features.
+ """
+ def __init__(self, n_out, n_in):
+ """
+ Initialize the Conv1D class with the specified number of output channels and input channels.
+
+ Args:
+ self (object): The instance of the Conv1D class.
+ n_out (int): The number of output channels for the convolution operation.
+ n_in (int): The number of input channels for the convolution operation.
+
+ Returns:
+ None.
+
+ Raises:
+ None.
+ """
+ super().__init__()
+ self.n_out = n_out
+ # The weight is stored transposed, shape (n_in, n_out), to match the matmul in `forward`.
+ self.weight = nn.Parameter(initializer(Normal(sigma=0.02), (n_in, n_out), mindspore.float32))
+ self.bias = nn.Parameter(ops.zeros(n_out))
+
+ def forward(self, x):
+ """
+ Constructs the 1D convolutional operation on the input tensor x.
+
+ Args:
+ self (Conv1D): An instance of the Conv1D class.
+ x (mindspore.Tensor): The input tensor on which the convolution operation is applied.
+ Should have a shape of (batch_size, sequence_length, input_channels).
+
+ Returns:
+ mindspore.Tensor: The output tensor of shape (*x.shape[:-1], n_out).
+ """
+ size_out = x.shape[:-1] + (self.n_out,)
+ x = ops.matmul(x.view(-1, x.shape[-1]), self.weight) + self.bias
+ x = x.view(size_out)
+ return x
+
+
+def prune_conv1d_layer(layer, index, dim=1):
+ """
+ Prune a Conv1D layer to keep only entries in index. A Conv1D works as a Linear layer (see e.g. BERT) but the weights
+ are transposed.
+
+ Used to remove heads.
+
+ Args:
+ layer ([`~mindspore_utils.Conv1D`]): The layer to prune.
+ index (`mindspore.Tensor[int64]`): The indices to keep in the layer.
+ dim (`int`, *optional*, defaults to 1): The dimension on which to keep the indices.
+
+ Returns:
+ [`~mindspore_utils.Conv1D`]: The pruned layer as a new layer with `requires_grad=True`.
+ """
+ W = layer.weight.index_select(dim, index)
+ if dim == 0:
+ b = layer.bias
+ else:
+ b = layer.bias[index]
+ new_size = list(layer.weight.shape)
+ new_size[dim] = len(index)
+ new_layer = Conv1D(new_size[1], new_size[0])
+ new_layer.weight.requires_grad = False
+ new_layer.weight.assign_value(W)
+ new_layer.weight.requires_grad = True
+ new_layer.bias.requires_grad = False
+ new_layer.bias.assign_value(b)
+ new_layer.bias.requires_grad = True
+ return new_layer
+
+
+def find_pruneable_heads_and_indices(heads, n_heads, head_size, already_pruned_heads):
+ """
+ Finds the heads and their indices taking `already_pruned_heads` into account.
+
+ Args:
+ heads (`List[int]`): List of the indices of heads to prune.
+ n_heads (`int`): The number of heads in the model.
+ head_size (`int`): The size of each head.
+ already_pruned_heads (`Set[int]`): A set of already pruned heads.
+
+ Returns:
+ `Tuple[Set[int], mindspore.Tensor[int64]]`: A tuple with the remaining heads and their corresponding indices.
+ """
+ mask = ops.ones((n_heads, head_size))
+ heads = set(heads) - already_pruned_heads # Convert to set and remove already pruned heads
+ for head in heads:
+ # Compute how many pruned heads are before the head and move the index accordingly
+ head = head - sum(1 if h < head else 0 for h in already_pruned_heads)
+ mask[head] = 0
+ mask = mask.view(-1).eq(1)
+ index = ops.arange(len(mask), dtype=mindspore.int64)[mask]
+ return heads, index
+
+def prune_linear_layer(layer, index, dim=0):
+ """
+ Prune a linear layer to keep only entries in index.
+ Used to remove heads.
+
+ Args:
+ layer (`mindspore.nn.Linear`): The layer to prune.
+ index (`mindspore.Tensor[int64]`): The indices to keep in the layer.
+ dim (`int`, *optional*, defaults to 0): The dimension on which to keep the indices.
+
+ Returns:
+ `mindspore.nn.Linear`: The pruned layer as a new layer with `requires_grad=True`.
+ """
+ W = layer.weight.index_select(dim, index).copy()
+ if layer.bias is not None:
+ if dim == 1:
+ b = layer.bias.copy()
+ else:
+ b = layer.bias[index].copy()
+ new_size = list(layer.weight.shape)
+ new_size[dim] = len(index)
+ new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None)
+ new_layer.weight.requires_grad = False
+ new_layer.weight.assign_value(W)
+ new_layer.weight.requires_grad = True
+ if layer.bias is not None:
+ new_layer.bias.requires_grad = False
+ new_layer.bias.assign_value(b)
+ new_layer.bias.requires_grad = True
+ return new_layer
+
+
+def apply_chunking_to_forward(forward_fn, chunk_size, chunk_axis, *input_tensors):
+ """
+ This function chunks the `input_tensors` into smaller input tensor parts of size `chunk_size` over the dimension
+ `chunk_axis`. It then applies a layer `forward_fn` to each chunk independently to save memory.
+ If the `forward_fn` is independent across the `chunk_axis` this function will yield the same result as directly
+ applying `forward_fn` to `input_tensors`.
+
+ Args:
+ forward_fn (`Callable[..., mindspore.Tensor]`):
+ The forward function of the model.
+ chunk_size (`int`):
+ The chunk size of a chunked tensor: `num_chunks = len(input_tensors[0]) / chunk_size`.
+ chunk_axis (`int`):
+ The dimension over which the `input_tensors` should be chunked.
+ input_tensors (`Tuple[mindspore.Tensor]`):
+ The input tensors of `forward_fn` which will be chunked
+
+ Returns:
+ `mindspore.Tensor`: A tensor with the same shape as the one `forward_fn` would have returned if applied directly.
+ """
+ assert len(input_tensors) > 0, f"{input_tensors} has to be a tuple/list of tensors"
+
+ # inspect.signature exist since python 3.5 and is a python method -> no problem with backward compatibility
+ num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters)
+ if num_args_in_forward_chunk_fn != len(input_tensors):
+ raise ValueError(
+ f"forward_chunk_fn expects {num_args_in_forward_chunk_fn} arguments, but only {len(input_tensors)} input "
+ "tensors are given"
+ )
+
+ if chunk_size > 0:
+ tensor_shape = input_tensors[0].shape[chunk_axis]
+ for input_tensor in input_tensors:
+ if input_tensor.shape[chunk_axis] != tensor_shape:
+ raise ValueError(
+ f"All input tenors have to be of the same shape: {tensor_shape}, "
+ f"found shape {input_tensor.shape[chunk_axis]}"
+ )
+
+ if input_tensors[0].shape[chunk_axis] % chunk_size != 0:
+ raise ValueError(
+ f"The dimension to be chunked {input_tensors[0].shape[chunk_axis]} has to be a multiple of the chunk "
+ f"size {chunk_size}"
+ )
+
+ num_chunks = input_tensors[0].shape[chunk_axis] // chunk_size
+
+ # chunk input tensor into tuples
+ input_tensors_chunks = tuple(ops.chunk(input_tensor, num_chunks, dim=chunk_axis) for input_tensor in input_tensors)
+ # apply forward fn to every tuple
+ output_chunks = tuple(forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks))
+ # concatenate output at same dimension
+ return ops.cat(output_chunks, dim=chunk_axis)
+
+ return forward_fn(*input_tensors)
+
+def zero_init(cls, *args, **kwargs):
+ """init zeros to speed up initialize stage."""
+ # Iterate over a snapshot of the keys, since `kwargs` is mutated inside the loop.
+ for k in list(kwargs.keys()):
+ if 'init' in k:
+ kwargs.pop(k)
+ init_signature = inspect.signature(cls.__init__)
+ init_params = init_signature.parameters
+ for param_name in init_params.keys():
+ if 'init' in param_name:
+ kwargs[param_name] = 'zeros'
+ def _reset_parameters(self):
+ pass
+ cls.reset_parameters = _reset_parameters
+ return cls(*args, **kwargs)
+
+def meshgrid(
+ *tensors: Union[mindspore.Tensor, List[mindspore.Tensor]], indexing: Optional[str] = None
+) -> Tuple[mindspore.Tensor, ...]:
+ """
+ Wrapper around ops.meshgrid to avoid warning messages about the introduced `indexing` argument.
+
+ Reference: https://pytorch.org/docs/1.13/generated/torch.meshgrid.html
+ """
+ return ops.meshgrid(*tensors, indexing=indexing)
+
+def isin_friendly(elements: mindspore.Tensor, test_elements: mindspore.Tensor) -> mindspore.Tensor:
+ """
+ Same as `ops.isin` without flags. Ported from the MPS-friendly implementation in transformers.
+
+ Args:
+ elements (`mindspore.Tensor`): Input elements
+ test_elements (`mindspore.Tensor`): The elements to check against.
+
+ Returns:
+ `mindspore.Tensor`: A boolean tensor of the same shape as `elements` that is True for `elements` in `test_elements`
+ and False otherwise
+ """
+ return elements.tile((test_elements.shape[0], 1)).eq(test_elements.unsqueeze(1)).sum(0).bool().squeeze()
\ No newline at end of file
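These helpers mirror the PyTorch-side utilities from transformers on top of mindnlp.core. A short usage sketch of `Conv1D` and `apply_chunking_to_forward` (shapes and values are illustrative, assuming a working MindSpore/mindnlp install):

```python
# Usage sketch for the utilities above; illustrative shapes,
# assuming a working MindSpore/mindnlp install.
from mindnlp.core import ops
from mindnlp.transformers.ms_utils import Conv1D, apply_chunking_to_forward

layer = Conv1D(n_out=8, n_in=4)          # weight stored transposed: (4, 8)
x = ops.ones((2, 16, 4))                 # (batch, seq_len, n_in)

def ff(chunk):
    return layer(chunk)                  # applied independently per chunk

# Split the 16-step sequence axis into 4 chunks of 4 steps to bound peak memory.
y = apply_chunking_to_forward(ff, 4, 1, x)
print(y.shape)                           # (2, 16, 8)
```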
diff --git a/mindnlp/trl/__init__.py b/mindnlp/trl/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/mindnlp/utils/__init__.py b/mindnlp/utils/__init__.py
index e69de29bb..7fca6fc2b 100644
--- a/mindnlp/utils/__init__.py
+++ b/mindnlp/utils/__init__.py
@@ -0,0 +1,37 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""
+Common utils
+"""
+from .generic import *
+# from .decompress import unzip, untar, ungz
+# from .download import *
+# from .compatibility import *
+# from .chat_template_utils import *
+from .import_utils import requires_backends, is_mindspore_available, OptionalDependencyNotAvailable, is_sentencepiece_available, \
+is_tokenizers_available, direct_transformers_import, is_protobuf_available, is_safetensors_available, \
+is_cython_available, is_pretty_midi_available, is_essentia_available, is_librosa_available, is_scipy_available, is_pyctcdecode_available, \
+is_jieba_available, is_vision_available, is_sudachi_projection_available, is_g2p_en_available, is_levenshtein_available, is_nltk_available, \
+is_bs4_available, is_pytesseract_available, is_tiktoken_available, is_einops_available, is_faiss_available, is_datasets_available, \
+is_sacremoses_available, is_phonemizer_available, is_speech_available, is_kenlm_available, is_triton_available
+
+# from .testing_utils import require_mindspore
+# from .save import convert_file_size_to_int
+# from .peft_utils import find_adapter_config_file
+
+DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
+DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]]
+SENTENCEPIECE_UNDERLINE = "▁"
\ No newline at end of file
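The `generic` helpers re-exported above include the `ModelOutput` base class defined in the file below; a sketch of its hybrid dict/tuple access (the `SampleOutput` dataclass here is hypothetical):

```python
# Sketch of ModelOutput's hybrid dict/tuple access; SampleOutput is hypothetical.
from dataclasses import dataclass
from typing import Optional
import mindspore
from mindnlp.utils.generic import ModelOutput

@dataclass
class SampleOutput(ModelOutput):
    logits: Optional[mindspore.Tensor] = None
    hidden_states: Optional[mindspore.Tensor] = None

out = SampleOutput(logits=mindspore.ops.ones((2, 3)))
print(out["logits"].shape)   # dict-style access by key
print(out[0].shape)          # tuple-style access; None fields are skipped
print(len(out.to_tuple()))   # 1 -- hidden_states is None, so it is dropped
```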
diff --git a/mindnlp/utils/generic.py b/mindnlp/utils/generic.py
new file mode 100644
index 000000000..2de3a7a36
--- /dev/null
+++ b/mindnlp/utils/generic.py
@@ -0,0 +1,612 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+Generic utils.
+"""
+import inspect
+from enum import Enum
+from collections import OrderedDict, UserDict
+from dataclasses import fields
+from typing import Any, Tuple, ContextManager, List, TypedDict, Optional
+from contextlib import ExitStack
+from functools import wraps
+import numpy as np
+import mindspore
+from .import_utils import is_mindspore_available
+
+
+def is_tensor(x):
+ """
+ Tests if `x` is a `mindspore.Tensor` or `np.ndarray`.
+ """
+ if isinstance(x, mindspore.Tensor):
+ return True
+
+ return isinstance(x, np.ndarray)
+
+def _is_mindspore(x):
+ """
+ Checks if the input x is a MindSpore tensor.
+
+ Args:
+ x (object): The input object to be checked.
+
+ Returns:
+ bool: True if `x` is a `mindspore.Tensor`, False otherwise.
+ """
+ return isinstance(x, mindspore.Tensor)
+
+
+def is_mindspore_tensor(x):
+ """
+ Tests if `x` is a MindSpore tensor or not. Safe to call even if MindSpore is not installed.
+ """
+ return False if not is_mindspore_available() else _is_mindspore(x)
+
+def set_attribute_for_modules(module, key: str, value: Any):
+ """
+ Set a value on a module and all of its submodules.
+ """
+ setattr(module, key, value)
+ for submodule in module.children():
+ set_attribute_for_modules(submodule, key, value)
+
+def del_attribute_from_modules(module, key: str):
+ """
+ Delete an attribute from a module and all of its submodules. Needed by `can_return_tuple` below.
+ """
+ delattr(module, key)
+ for submodule in module.children():
+ del_attribute_from_modules(submodule, key)
+
+def can_return_tuple(func):
+ """
+ Decorator to wrap model method, to call output.to_tuple() if return_dict=False passed as a kwarg or
+ use_return_dict=False is set in the config.
+
+ Note:
+ output.to_tuple() convert output to tuple skipping all `None` values.
+ """
+
+ @wraps(func)
+ def wrapper(self, *args, **kwargs):
+ is_requested_to_return_tuple = kwargs.pop("return_dict", True) is False
+ is_configured_to_return_tuple = self.config.use_return_dict is False if hasattr(self, "config") else False
+
+ # The following allows to convert output to tuple ONLY on top level forward call,
+ # while internal modules of the model will return Output objects
+ # to be able to use name-based attribute access in modeling code.
+
+ # We will check if we are on the top-level module; if so, turn off the to-tuple conversion for all
+ # underlying calls.
+ is_top_level_module = getattr(self, "_is_top_level_module", True)
+ if is_configured_to_return_tuple and is_top_level_module:
+ set_attribute_for_modules(self, "_is_top_level_module", False)
+
+ try:
+ output = func(self, *args, **kwargs)
+ if is_requested_to_return_tuple or (is_configured_to_return_tuple and is_top_level_module):
+ output = output.to_tuple()
+ finally:
+ # Remove the flag after the model forward call is finished.
+ if is_configured_to_return_tuple and is_top_level_module:
+ del_attribute_from_modules(self, "_is_top_level_module")
+
+ return output
+
+ return wrapper
+
+class ExplicitEnum(str, Enum):
+ """
+ Enum with more explicit error message for missing values.
+ """
+ @classmethod
+ def _missing_(cls, value):
+ """
+ This method `_missing_` in the class `ExplicitEnum` is a class method used to handle missing values in the ExplicitEnum class.
+
+ Args:
+ cls (class): The class itself, used for referring to the class instance inside the method.
+ value (any): The value that was not found in the ExplicitEnum class.
+
+ Returns:
+ None: This method does not return any value as it raises an exception when called.
+
+ Raises:
+ ValueError: If the value provided is not a valid member of the Enum class, a ValueError is raised with a message listing the valid options to choose from.
+ """
+ raise ValueError(
+ f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}"
+ )
+
+class TensorType(ExplicitEnum):
+ """
+ Possible values for the `return_tensors` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for
+ tab-completion in an IDE.
+ """
+ MINDSPORE = "ms"
+ NUMPY = "np"
+
+class PaddingStrategy(ExplicitEnum):
+ """
+ Possible values for the `padding` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion in an
+ IDE.
+ """
+ LONGEST = "longest"
+ MAX_LENGTH = "max_length"
+ DO_NOT_PAD = "do_not_pad"
+
+class LossKwargs(TypedDict, total=False):
+ """
+ Keyword arguments to be passed to the loss function
+
+ Attributes:
+ num_items_in_batch (`int`, *optional*):
+ Number of items in the batch. It is recommended to pass it when
+ you are doing gradient accumulation.
+ """
+
+ num_items_in_batch: Optional[int]
+
+class ModelOutput(OrderedDict):
+ """
+ Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a
+ tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular
+ python dictionary.
+
+ Note:
+ You can't unpack a `ModelOutput` directly. Use the [`~utils.ModelOutput.to_tuple`] method to convert it
+ to a tuple first.
+ """
+
+ def __post_init__(self):
+ """Perform post-initialization actions for the ModelOutput class.
+
+ This method is automatically called after the initialization of a ModelOutput object.
+
+ Args:
+ self: An instance of the ModelOutput class.
+
+ Returns:
+ None
+
+ Raises:
+ ValueError: If the ModelOutput object has no fields or more than one required field.
+ ValueError: If a key/value pair in the first field is not a tuple or if it does not follow the format (key, value).
+ ValueError: If the key/value pair cannot be set for a given element in the first field.
+ """
+ class_fields = fields(self)
+
+ # Safety and consistency checks
+ if len(class_fields) == 0:
+ raise ValueError(f"{self.__class__.__name__} has no fields.")
+ if not all(field.default is None for field in class_fields[1:]):
+ raise ValueError(f"{self.__class__.__name__} should not have more than one required field.")
+
+ first_field = getattr(self, class_fields[0].name)
+ other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:])
+
+ if other_fields_are_none and not is_tensor(first_field):
+ if isinstance(first_field, dict):
+ iterator = first_field.items()
+ first_field_iterator = True
+ else:
+ try:
+ iterator = iter(first_field)
+ first_field_iterator = True
+ except TypeError:
+ first_field_iterator = False
+
+ # if we provided an iterator as first field and the iterator is a (key, value) iterator
+ # set the associated fields
+ if first_field_iterator:
+ for idx, element in enumerate(iterator):
+ if (
+ not isinstance(element, (list, tuple))
+ or not len(element) == 2
+ or not isinstance(element[0], str)
+ ):
+ if idx == 0:
+ # If we do not have an iterator of key/values, set it as attribute
+ self[class_fields[0].name] = first_field
+ else:
+ # If we have a mixed iterator, raise an error
+ raise ValueError(
+ f"Cannot set key/value for {element}. It needs to be a tuple (key, value)."
+ )
+ break
+ setattr(self, element[0], element[1])
+ if element[1] is not None:
+ self[element[0]] = element[1]
+ elif first_field is not None:
+ self[class_fields[0].name] = first_field
+ else:
+ for field in class_fields:
+ v = getattr(self, field.name)
+ if v is not None:
+ self[field.name] = v
+
+ def __delitem__(self, *args, **kwargs):
+ """
+ __delitem__
+
+ Deletes an item from the ModelOutput instance.
+
+ Args:
+ self (ModelOutput): The ModelOutput instance from which the item will be deleted.
+
+ Returns:
+ None. This method does not return a value.
+
+ Raises:
+ RuntimeError: If the '__delitem__' method is attempted to be used on a ModelOutput instance, a RuntimeError is raised with a message indicating that this method cannot be used on the instance.
+ """
+ raise RuntimeError(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")
+
+ def setdefault(self, *args, **kwargs):
+ """
+ Sets a default value in the ModelOutput instance.
+
+ Args:
+ self: The ModelOutput instance itself.
+
+ Returns:
+ None. This method does not return any value.
+
+ Raises:
+ RuntimeError: Always raised; ``setdefault`` is not supported on a ModelOutput instance.
+ """
+ raise RuntimeError(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")
+
+ def pop(self, *args, **kwargs):
+ """
+ Method that raises a RuntimeError to prevent the use of 'pop' on a ModelOutput instance.
+
+ Args:
+ self (object): The ModelOutput instance on which 'pop' is being called.
+ This parameter is required and represents the current instance of the class.
+
+ Returns:
+ None. This method does not return any value.
+
+ Raises:
+ RuntimeError: Raised when attempting to use 'pop' method on a ModelOutput instance. The exception message
+ specifies that 'pop' cannot be used on a ModelOutput instance to prevent unintended behavior.
+ """
+ raise RuntimeError(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
+
+ def update(self, *args, **kwargs):
+ """
+ Updates the current instance of the ModelOutput class.
+
+ Args:
+ self (ModelOutput): The instance of the ModelOutput class.
+
+ Returns:
+ None: This method does not return any value.
+
+ Raises:
+ RuntimeError: If the method is called on an instance of the ModelOutput class. This is to prevent using the 'update' method on a ModelOutput instance, as it is not allowed.
+
+ Note:
+ The 'update' method is not allowed to be used on a ModelOutput instance. If called, it will raise a RuntimeError.
+ """
+ raise RuntimeError(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")
+
+ def __getitem__(self, k):
+ """
+ This method allows accessing the elements of the ModelOutput object using the square bracket notation.
+
+ Args:
+ self (ModelOutput): The instance of the ModelOutput class.
+ k (str or int): The key or index for accessing the element. If k is a string, it is used as a key to retrieve the corresponding value. If k is an integer, it is used as an index to retrieve the
+element.
+
+ Returns:
+ None: This method does not return any value directly. The retrieved value is returned based on the input key or index.
+
+ Raises:
+ TypeError: If the input parameter k is not a string or an integer.
+ KeyError: If the input key k is not found in the internal dictionary when k is a string.
+ IndexError: If the input index k is out of range when k is an integer.
+ """
+ if isinstance(k, str):
+ inner_dict = dict(self.items())
+ return inner_dict[k]
+ return self.to_tuple()[k]
+
+ def __setattr__(self, name, value):
+ """
+ Method __setattr__ in the class ModelOutput sets the value for the specified attribute name.
+
+ Args:
+ self (object): The instance of the ModelOutput class.
+ name (str): The name of the attribute to be set.
+ value (any): The value to be assigned to the attribute. It can be of any type.
+
+ Returns:
+ None. This method does not return any value.
+
+ Note:
+ No exceptions are raised. If the attribute name is not among the existing keys it is simply added as a
+ new attribute; the key/value pair is synchronized only for non-None values of existing keys.
+ """
+ if name in self.keys() and value is not None:
+ # Don't call self.__setitem__ to avoid recursion errors
+ super().__setitem__(name, value)
+ super().__setattr__(name, value)
+
+ def __setitem__(self, key, value):
+ """
+ This method '__setitem__' in the class 'ModelOutput' allows setting key-value pairs in the model output object.
+
+ Args:
+ self (ModelOutput): The instance of the ModelOutput class.
+ key (Any): The key to be set in the model output object.
+ value (Any): The value corresponding to the key to be set in the model output object.
+
+ Returns:
+ None.
+
+ Raises:
+ Whatever `OrderedDict.__setitem__` raises for invalid (e.g. unhashable) keys.
+ """
+ # Will raise a KeyException if needed
+ super().__setitem__(key, value)
+ # Don't call self.__setattr__ to avoid recursion errors
+ super().__setattr__(key, value)
+
+ def to_tuple(self) -> Tuple[Any]:
+ """
+ Convert self to a tuple containing all the attributes/keys that are not `None`.
+ """
+ return tuple(v for _, v in self.items())
+
+# vendored from distutils.util
+def strtobool(val):
+ """Convert a string representation of truth to true (1) or false (0).
+
+ True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values are 'n', 'no', 'f', 'false', 'off', and '0'.
+ Raises ValueError if 'val' is anything else.
+ """
+ val = val.lower()
+ if val in {"y", "yes", "t", "true", "on", "1"}:
+ return 1
+ if val in {"n", "no", "f", "false", "off", "0"}:
+ return 0
+ raise ValueError(f"invalid truth value {val!r}")
+
+class cached_property(property):
+ """
+ Descriptor that mimics @property but caches output in member variable.
+
+ From tensorflow_datasets
+
+ Built-in in functools from Python 3.8.
+ """
+ def __get__(self, obj, objtype=None):
+ """
+ Method '__get__' in the class 'cached_property'.
+
+ Args:
+ self (object): The current instance of the class.
+ obj (object): The object on which the method is being called.
+ objtype (object): The type of the object, if available. Defaults to None.
+
+ Returns:
+ Any: The cached value of the property, computed on first access and memoized on the instance.
+
+ Raises:
+ AttributeError: If the attribute is unreadable, this exception is raised.
+ """
+ # See docs.python.org/3/howto/descriptor.html#properties
+ if obj is None:
+ return self
+ if self.fget is None:
+ raise AttributeError("unreadable attribute")
+ attr = "__cached_" + self.fget.__name__
+ cached = getattr(obj, attr, None)
+ if cached is None:
+ cached = self.fget(obj)
+ setattr(obj, attr, cached)
+ return cached
+
+def _is_numpy(x):
+ """
+ This function checks if the input is a NumPy array.
+
+ Args:
+ x (any): The input to be checked for being a NumPy array.
+
+ Returns:
+ bool: True if `x` is a `np.ndarray`, False otherwise.
+ """
+ return isinstance(x, np.ndarray)
+
+
+def is_numpy_array(x):
+ """
+ Tests if `x` is a numpy array or not.
+ """
+ return _is_numpy(x)
+
+def infer_framework_from_repr(x):
+ """
+ Tries to guess the framework of an object `x` from its repr (brittle but will help in `is_tensor` to try the
+ frameworks in a smart order, without the need to import the frameworks).
+ """
+ representation = str(type(x))
+ if representation.startswith("= (3, 8):
+ # For Python 3.8 and later
+ from importlib import metadata as importlib_metadata
+else:
+ # For Python versions earlier than 3.8
+ import importlib_metadata
+
+
+logger = logging.get_logger(__name__)
+
+def _is_package_available(
+ pkg_name: str, return_version: bool = False
+) -> Union[Tuple[bool, str], bool]:
+ """
+ Checks if a specified package is available and optionally returns its version.
+
+ Args:
+ pkg_name (str): The name of the package to check for availability.
+ return_version (bool, optional): Indicates whether to return the package version along with availability status. Defaults to False.
+
+ Returns:
+ Union[Tuple[bool, str], bool]: If return_version is True, returns a tuple containing a boolean indicating package availability and a string representing the package version.
+ If return_version is False, returns a boolean indicating package availability.
+
+ Raises:
+ No specific exceptions are raised within this function.
+ """
+ # Check we're not importing a "pkg_name" directory somewhere but the actual library by trying to grab the version
+ package_exists = importlib.util.find_spec(pkg_name) is not None
+ package_version = "N/A"
+ if package_exists:
+ try:
+ package_version = importlib_metadata.version(pkg_name)
+ package_exists = True
+ except importlib_metadata.PackageNotFoundError:
+ package_exists = False
+ logger.debug(f"Detected {pkg_name} version {package_version}")
+ if return_version:
+ return package_exists, package_version
+ return package_exists
+
+
+_ftfy_available = _is_package_available("ftfy")
+_einops_available = _is_package_available('einops')
+_tiktoken_available = _is_package_available('tiktoken')
+_bs4_available = importlib.util.find_spec("bs4") is not None
+_pytest_available = _is_package_available("pytest")
+_datasets_available = _is_package_available("datasets")
+_sentencepiece_available = _is_package_available("sentencepiece")
+_soundfile_available = _is_package_available("soundfile")
+_tokenizers_available = _is_package_available("tokenizers")
+_pyctcdecode_available = _is_package_available("pyctcdecode")
+_safetensors_available = _is_package_available("safetensors")
+_modelscope_available = _is_package_available("modelscope")
+_jieba_available = _is_package_available("jieba")
+_pytesseract_available = _is_package_available("pytesseract")
+_g2p_en_available = _is_package_available("g2p_en")
+_phonemizer_available = _is_package_available("phonemizer")
+# `_is_package_available` returns (available, version) when return_version=True
+_mindspore_available, _mindspore_version = _is_package_available(
+ "mindspore", return_version=True
+)
+_sudachipy_available, _sudachipy_version = _is_package_available("sudachipy", return_version=True)
+
+_librosa_available = _is_package_available("librosa")
+_scipy_available = _is_package_available("scipy")
+_triton_available = _is_package_available("triton")
+_sacremoses_available = _is_package_available("sacremoses")
+_torchaudio_available = _is_package_available("pykaldi")
+_kenlm_available = _is_package_available("kenlm")
+_datamodel_code_generator_available = _is_package_available('datamodel_code_generator')
+_pretty_midi_available = importlib.util.find_spec("pretty_midi") is not None
+try:
+ _pretty_midi_version = importlib_metadata.version("pretty_midi")
+ logger.debug(f"Successfully imported pretty_midi version {_pretty_midi_version}")
+except importlib_metadata.PackageNotFoundError:
+ _pretty_midi_available = False
+
+_essentia_available = importlib.util.find_spec("essentia") is not None
+try:
+ _essentia_version = importlib_metadata.version("essentia")
+ logger.debug(f"Successfully imported essentia version {_essentia_version}")
+except importlib_metadata.PackageNotFoundError:
+ _essentia_available = False
+
+_levenshtein_available = _is_package_available("Levenshtein")
+_nltk_available = _is_package_available("nltk")
+
+
+_faiss_available = importlib.util.find_spec("faiss") is not None
+try:
+ _faiss_version = importlib.metadata.version("faiss")
+ logger.debug(f"Successfully imported faiss version {_faiss_version}")
+except importlib.metadata.PackageNotFoundError:
+ try:
+ _faiss_version = importlib.metadata.version("faiss-cpu")
+ logger.debug(f"Successfully imported faiss version {_faiss_version}")
+ except importlib.metadata.PackageNotFoundError:
+ _faiss_available = False
+
+def is_triton_available():
+ return _triton_available
+
+def is_datamodel_code_generator_available():
+ return _datamodel_code_generator_available
+
+def is_faiss_available():
+ return _faiss_available
+
+def is_levenshtein_available():
+ return _levenshtein_available
+
+
+def is_nltk_available():
+ return _nltk_available
+
+
+def is_einops_available():
+ return _einops_available
+
+
+def is_sudachi_available():
+ """
+ Checks if SudachiPy is available for use.
+
+ Returns:
+ bool: True if SudachiPy is available, False otherwise.
+ """
+ return _sudachipy_available
+
+
+def get_sudachi_version():
+ '''
+ Returns the version of SudachiPy.
+
+ Returns:
+ str: The installed SudachiPy version, or "N/A" if it could not be determined.
+ '''
+ return _sudachipy_version
+
+
+def is_bs4_available():
+ return _bs4_available
+
+def is_sudachi_projection_available():
+ """
+ Checks if Sudachi projection is available.
+
+ This function checks if Sudachi is available and if the Sudachi version is equal to or greater than 0.6.8.
+
+ Returns:
+ bool: True if SudachiPy >= 0.6.8 is installed, False otherwise.
+ """
+ if not is_sudachi_available():
+ return False
+
+ # NOTE: We require sudachipy>=0.6.8 to use projection option in sudachi_kwargs for the forwardor of BertJapaneseTokenizer.
+ # - `projection` option is not supported in sudachipy<0.6.8, see https://github.com/WorksApplications/sudachi.rs/issues/230
+ return version.parse(_sudachipy_version) >= version.parse("0.6.8")
+
+def is_sacremoses_available():
+ """
+ Checks if the sacremoses library is available in the current environment.
+
+ Returns:
+ bool: True if sacremoses is available, False otherwise.
+ """
+ return _sacremoses_available
+
+
+def is_mindspore_available():
+ '''
+ Checks if MindSpore is available.
+
+ Returns:
+ bool: True if MindSpore is available, False otherwise.
+ '''
+ return _mindspore_available
+
+
+def get_mindspore_version():
+ """
+ Returns the current version of MindSpore.
+
+ Returns:
+ str: The installed MindSpore version, or "N/A" if it could not be determined.
+ """
+ return _mindspore_version
+
+
+
+def is_ftfy_available():
+ return _ftfy_available
+
+
+def is_datasets_available():
+ """
+ Checks if datasets are available.
+
+ Returns:
+ bool: True if 🤗 Datasets is available, False otherwise.
+ """
+ return _datasets_available
+
+
+def is_sentencepiece_available():
+ """
+ Checks if SentencePiece library is available.
+
+ Returns:
+ bool: True if SentencePiece is available, False otherwise.
+ """
+ return _sentencepiece_available
+
+
+def is_tokenizers_available():
+ """Check if tokenizers are available.
+
+ This function checks if tokenizers are available for use. It does not take any parameters.
+
+ Returns:
+ bool: True if 🤗 Tokenizers is available, False otherwise.
+ """
+ return _tokenizers_available
+
+
+def is_safetensors_available():
+ """
+ Checks if SafeTensors is available in the current environment.
+
+ Returns:
+ bool: True if SafeTensors is available, False otherwise.
+
+ """
+ return _safetensors_available
+
+
+def is_modelscope_available():
+ '''
+ Checks if the ModelScope library is available.
+
+ Returns:
+ bool: True if ModelScope is available, False otherwise.
+ '''
+ return _modelscope_available
+
+
+def is_cython_available():
+ """
+ Checks if Cython is available in the current environment.
+
+ Returns:
+ bool: True if Cython (via pyximport) is available, False otherwise.
+ """
+ return importlib.util.find_spec("pyximport") is not None
+
+
+def is_protobuf_available():
+ """
+ Checks if the Google Protocol Buffers (protobuf) library is available.
+
+ Returns:
+ bool: True if the protobuf library is available, False otherwise.
+
+ Raises:
+ No specific exceptions are raised by this function.
+ """
+ if importlib.util.find_spec("google") is None:
+ return False
+ return importlib.util.find_spec("google.protobuf") is not None
+
+
+def is_pytest_available():
+ """
+ Check if the pytest library is available.
+
+ Returns:
+ bool: True if pytest is available, False otherwise.
+
+ """
+ return _pytest_available
+
+
+def is_pretty_midi_available():
+ """
+ Checks if the 'pretty_midi' library is available.
+
+ Returns:
+ bool: True if pretty_midi is available, False otherwise.
+ """
+ return _pretty_midi_available
+
+
+def is_librosa_available():
+ """
+ Checks if the 'librosa' library is available.
+
+ Returns:
+ bool: True if librosa is available, False otherwise.
+ """
+ return _librosa_available
+
+
+def is_essentia_available():
+ """
+ Checks if the 'essentia' library is available.
+
+ Returns:
+ bool: True if essentia is available, False otherwise.
+ """
+ return _essentia_available
+
+
+def is_pyctcdecode_available():
+ """
+ Check if the PyCTCDecode library is available.
+
+ Returns:
+ bool: True if pyctcdecode is available, False otherwise.
+ """
+ return _pyctcdecode_available
+
+
+def is_scipy_available():
+ """
+ Checks if the SciPy library is available.
+
+ Returns:
+ bool: True if SciPy is available, False otherwise.
+ """
+ return _scipy_available
+
+
+def is_jieba_available():
+ '''
+ Checks if the Jieba library is available.
+
+ Returns:
+ bool: True if Jieba is available, False otherwise.
+
+ '''
+ return _jieba_available
+
+
+def is_pytesseract_available():
+ """
+ Check if pytesseract library is available.
+
+ Returns:
+ bool: True if pytesseract is available, False otherwise.
+ """
+ return _pytesseract_available
+
+
+def is_g2p_en_available():
+ return _g2p_en_available
+
+
+def is_tiktoken_available():
+ return _tiktoken_available
+
+
+def is_phonemizer_available():
+ return _phonemizer_available
+
+
+@lru_cache()
+def is_vision_available():
+ """
+ Checks if the Pillow library is available for image processing.
+
+ Returns:
+ bool: True if Pillow (or Pillow-SIMD) is available, False otherwise.
+ """
+ _pil_available = importlib.util.find_spec("PIL") is not None
+ if _pil_available:
+ try:
+ package_version = importlib_metadata.version("Pillow")
+ except importlib_metadata.PackageNotFoundError:
+ try:
+ package_version = importlib_metadata.version("Pillow-SIMD")
+ except importlib_metadata.PackageNotFoundError:
+ return False
+ logger.debug(f"Detected PIL version {package_version}")
+ return _pil_available
+
+
+def is_in_notebook():
+ """
+ This function checks if the code is running in a Jupyter notebook environment by examining the current execution environment and relevant environment variables.
+
+ Returns:
+ bool: Returns True if the code is running in a Jupyter notebook environment, otherwise False.
+
+ Note:
+ AttributeError, ImportError, and KeyError raised while probing the environment (console, VS Code,
+ Databricks) are caught internally and simply result in a False return value.
+ """
+ try:
+ # Test adapted from tqdm.autonotebook: https://github.com/tqdm/tqdm/blob/master/tqdm/autonotebook.py
+ get_ipython = sys.modules["IPython"].get_ipython
+ if "IPKernelApp" not in get_ipython().config:
+ raise ImportError("console")
+ if "VSCODE_PID" in os.environ:
+ raise ImportError("vscode")
+ if (
+ "DATABRICKS_RUNTIME_VERSION" in os.environ
+ and os.environ["DATABRICKS_RUNTIME_VERSION"] < "11.0"
+ ):
+ # Databricks Runtime 11.0 and above uses IPython kernel by default so it should be compatible with Jupyter notebook
+ # https://docs.microsoft.com/en-us/azure/databricks/notebooks/ipython-kernel
+ raise ImportError("databricks")
+
+ return importlib.util.find_spec("IPython") is not None
+ except (AttributeError, ImportError, KeyError):
+ return False
+
+
+# docstyle-ignore
+CYTHON_IMPORT_ERROR = """
+{0} requires the Cython library but it was not found in your environment. You can install it with pip: `pip install
+Cython`. Please note that you may need to restart your runtime after installation.
+"""
+
+# docstyle-ignore
+DATASETS_IMPORT_ERROR = """
+{0} requires the 🤗 Datasets library but it was not found in your environment. You can install it with:
+```
+pip install datasets
+```
+In a notebook or a colab, you can install it by executing a cell with
+```
+!pip install datasets
+```
+then restarting your kernel.
+
+Note that if you have a local folder named `datasets` or a local python file named `datasets.py` in your current
+working directory, python may try to import this instead of the 🤗 Datasets library. You should rename this folder or
+that python file if that's the case. Please note that you may need to restart your runtime after installation.
+"""
+
+# docstyle-ignore
+TOKENIZERS_IMPORT_ERROR = """
+{0} requires the 🤗 Tokenizers library but it was not found in your environment. You can install it with:
+```
+pip install tokenizers
+```
+In a notebook or a colab, you can install it by executing a cell with
+```
+!pip install tokenizers
+```
+Please note that you may need to restart your runtime after installation.
+"""
+
+# docstyle-ignore
+SENTENCEPIECE_IMPORT_ERROR = """
+{0} requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
+installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
+that match your environment. Please note that you may need to restart your runtime after installation.
+"""
+
+# docstyle-ignore
+PROTOBUF_IMPORT_ERROR = """
+{0} requires the protobuf library but it was not found in your environment. Checkout the instructions on the
+installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
+that match your environment. Please note that you may need to restart your runtime after installation.
+"""
+
+# docstyle-ignore
+MINDSPORE_IMPORT_ERROR = """
+{0} requires the MindSpore library but it was not found in your environment. Checkout the instructions on the
+installation page: https://www.mindspore.cn/install/ and follow the ones that match your environment.
+Please note that you may need to restart your runtime after installation.
+"""
+
+LIBROSA_IMPORT_ERROR = """
+{0} requires the librosa library but it was not found in your environment. You can install it with pip:
+`pip install librosa`
+Please note that you may need to restart your runtime after installation.
+"""
+
+ESSENTIA_IMPORT_ERROR = """
+{0} requires the essentia library but it was not found in your environment. You can install it with pip:
+`pip install essentia==2.1b6.dev1034`
+Please note that you may need to restart your runtime after installation.
+"""
+
+SCIPY_IMPORT_ERROR = """
+{0} requires the scipy library but it was not found in your environment. You can install it with pip:
+`pip install scipy`. Please note that you may need to restart your runtime after installation.
+"""
+
+PRETTY_MIDI_IMPORT_ERROR = """
+{0} requires the pretty_midi library but it was not found in your environment. You can install it with pip:
+`pip install pretty_midi`
+Please note that you may need to restart your runtime after installation.
+"""
+
+# docstyle-ignore
+PYCTCDECODE_IMPORT_ERROR = """
+{0} requires the pyctcdecode library but it was not found in your environment. You can install it with pip:
+`pip install pyctcdecode`. Please note that you may need to restart your runtime after installation.
+"""
+
+JIEBA_IMPORT_ERROR = """
+{0} requires the jieba library but it was not found in your environment. You can install it with pip: `pip install
+jieba`. Please note that you may need to restart your runtime after installation.
+"""
+
+VISION_IMPORT_ERROR = """
+{0} requires the PIL library but it was not found in your environment. You can install it with pip:
+`pip install pillow`. Please note that you may need to restart your runtime after installation.
+"""
+
+# docstyle-ignore
+G2P_EN_IMPORT_ERROR = """
+{0} requires the g2p-en library but it was not found in your environment. You can install it with pip:
+`pip install g2p-en`. Please note that you may need to restart your runtime after installation.
+"""
+
+BACKENDS_MAPPING = OrderedDict(
+ [
+ ("mindspore", (is_mindspore_available, MINDSPORE_IMPORT_ERROR)),
+ ("cython", (is_cython_available, CYTHON_IMPORT_ERROR)),
+ ("datasets", (is_datasets_available, DATASETS_IMPORT_ERROR)),
+ ("protobuf", (is_protobuf_available, PROTOBUF_IMPORT_ERROR)),
+ ("sentencepiece", (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)),
+ ("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)),
+ ("librosa", (is_librosa_available, LIBROSA_IMPORT_ERROR)),
+ ("essentia", (is_essentia_available, ESSENTIA_IMPORT_ERROR)),
+ ("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)),
+ ("pretty_midi", (is_pretty_midi_available, PRETTY_MIDI_IMPORT_ERROR)),
+ ("pyctcdecode", (is_pyctcdecode_available, PYCTCDECODE_IMPORT_ERROR)),
+ ("jieba", (is_jieba_available, JIEBA_IMPORT_ERROR)),
+ ("vision", (is_vision_available, VISION_IMPORT_ERROR)),
+ ("g2p_en", (is_g2p_en_available, G2P_EN_IMPORT_ERROR)),
+ ]
+)
+
+
+def requires_backends(obj, backends):
+ """
+ Function to check if the specified backends are available for the given object.
+
+ Args:
+ obj (object): The object for which backends availability needs to be checked.
+ backends (list or tuple or str): The backend(s) to be checked for availability. Can be a single backend as a string or a list/tuple of backends.
+
+ Returns:
+ None. This function does not return any value.
+
+ Raises:
+ ImportError: If any of the specified backends are not available for the object.
+ """
+ if not isinstance(backends, (list, tuple)):
+ backends = [backends]
+
+ name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
+
+ checks = (BACKENDS_MAPPING[backend] for backend in backends)
+ failed = [msg.format(name) for available, msg in checks if not available()]
+ if failed:
+ raise ImportError("".join(failed))
+
+
+class DummyObject(type):
+ """
+ Metaclass for the dummy objects. Any class inheriting from it will return the ImportError generated by
+ `requires_backend` each time a user tries to access any method of that class.
+ """
+ def __getattribute__(cls, key):
+ """
+ This method is called automatically when an attribute is accessed on the 'DummyObject' class or any of its subclasses.
+
+ Args:
+ cls (type): The class object that the method was called on.
+ key (str): The name of the attribute being accessed.
+
+ Returns:
+ Any: The attribute value, for private names and `_from_config`; other accesses do not return normally.
+
+ Raises:
+ ImportError: Via `requires_backends`, when a public attribute is accessed and a backend is missing.
+ """
+ if key.startswith("_") and key != "_from_config":
+ return super().__getattribute__(key)
+ requires_backends(cls, cls._backends)
+
+
+def mindspore_required(func):
+ """
+ This function decorates another function to require the presence of MindSpore framework.
+
+ Args:
+ func (function): The function to be decorated.
+
+ Returns:
+ Callable: The wrapped function, which raises an ImportError when MindSpore is unavailable.
+
+ Raises:
+ FutureWarning: Always emitted; `mindspore_required` is deprecated in favor of `requires_backends`.
+ ImportError: If the decorated function requires MindSpore but MindSpore is not available.
+ """
+ warnings.warn(
+ "The method `torch_required` is deprecated. Use `requires_backends` instead.",
+ FutureWarning,
+ )
+
+ # Chose a different decorator name than in tests so it's clear they are not the same.
+ @wraps(func)
+ def wrapper(*args, **kwargs):
+ if is_mindspore_available():
+ return func(*args, **kwargs)
+ raise ImportError(f"Method `{func.__name__}` requires MindSpore.")
+
+ return wrapper
+
+
+class OptionalDependencyNotAvailable(BaseException):
+ """Internally used error class for signalling an optional dependency was not found."""
+
+
+def direct_transformers_import(path: str, file="__init__.py") -> ModuleType:
+ """Imports transformers directly
+
+ Args:
+ path (`str`): The path to the source file
+ file (`str`, optional): The file to join with the path. Defaults to "__init__.py".
+
+ Returns:
+ `ModuleType`: The resulting imported module
+ """
+ name = "mindnlp.transformers"
+ location = os.path.join(path, file)
+ spec = importlib.util.spec_from_file_location(
+ name, location, submodule_search_locations=[path]
+ )
+ module = importlib.util.module_from_spec(spec)
+ spec.loader.exec_module(module)
+ module = sys.modules[name]
+ return module
+
+
+def is_soundfile_availble():
+ return _soundfile_available
+
+
+def is_speech_available():
+ return _torchaudio_available
+
+
+def is_kenlm_available():
+ return _kenlm_available
\ No newline at end of file
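The `BACKENDS_MAPPING` / `requires_backends` / `DummyObject` trio gates optional dependencies. A sketch of how downstream code typically uses it (the class and function names here are hypothetical):

```python
# Sketch of the optional-dependency gating pattern; MidiTokenizer is hypothetical.
from mindnlp.utils.import_utils import DummyObject, requires_backends

class MidiTokenizer(metaclass=DummyObject):
    _backends = ["pretty_midi"]

def tokenize_midi(path):
    # Raises ImportError with the PRETTY_MIDI_IMPORT_ERROR text if pretty_midi is missing.
    requires_backends(tokenize_midi, ["pretty_midi"])
    ...

# Any public attribute access on MidiTokenizer re-runs the backend check:
# MidiTokenizer.from_pretrained(...)  # -> ImportError when pretty_midi is absent
```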
diff --git a/mindnlp/utils/logging.py b/mindnlp/utils/logging.py
new file mode 100644
index 000000000..5cd14e8b2
--- /dev/null
+++ b/mindnlp/utils/logging.py
@@ -0,0 +1,527 @@
+# coding=utf-8
+# Copyright 2020 Optuna, Hugging Face
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+# pylint: disable=unused-import
+""" Logging utilities."""
+
+import functools
+import logging
+import os
+import sys
+import threading
+from logging import (
+ CRITICAL, # NOQA
+ DEBUG, # NOQA
+ ERROR, # NOQA
+ FATAL, # NOQA
+ INFO, # NOQA
+ NOTSET, # NOQA
+ WARN, # NOQA
+ WARNING, # NOQA
+)
+from logging import captureWarnings as _captureWarnings
+from typing import Optional
+
+from tqdm import auto as tqdm_lib
+
+
+_lock = threading.Lock()
+_default_handler: Optional[logging.Handler] = None
+
+log_levels = {
+ "detail": logging.DEBUG, # will also print filename and line number
+ "debug": logging.DEBUG,
+ "info": logging.INFO,
+ "warning": logging.WARNING,
+ "error": logging.ERROR,
+ "critical": logging.CRITICAL,
+}
+
+_default_log_level = logging.WARNING
+
+_tqdm_active = True
+
+
+def _get_default_logging_level():
+ """
+ If the TRANSFORMERS_VERBOSITY env var is set to one of the valid choices, return that as the
+ new default level; otherwise fall back to `_default_log_level`.
+ """
+ env_level_str = os.getenv("TRANSFORMERS_VERBOSITY", None)
+ if env_level_str:
+ if env_level_str in log_levels:
+ return log_levels[env_level_str]
+ logging.getLogger().warning(
+ f"Unknown option TRANSFORMERS_VERBOSITY={env_level_str}, "
+ f"has to be one of: { ', '.join(log_levels.keys()) }"
+ )
+ return _default_log_level
+
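+# For instance, `TRANSFORMERS_VERBOSITY=detail python script.py` selects DEBUG here, and
+# `_configure_library_root_logger` below additionally installs a pathname/lineno formatter
+# for the "detail" setting.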
+
+def _get_library_name() -> str:
+ """
+ Returns the name of the library based on the module name.
+
+ Returns:
+ str: The name of the library extracted from the module name.
+
+ """
+ return __name__.split(".")[0] # pylint: disable=use-maxsplit-arg
+
+
+def _get_library_root_logger() -> logging.Logger:
+ """
+ Retrieves the root logger for the library.
+
+ Returns:
+ A logging.Logger object representing the root logger for the library.
+
+ Raises:
+ None.
+ """
+ return logging.getLogger(_get_library_name())
+
+
+def _configure_library_root_logger() -> None:
+ """
+ This function configures the root logger for the library.
+
+ Returns:
+ None: This function does not return any value.
+
+ Raises:
+ None
+ """
+ global _default_handler
+
+ with _lock:
+ if _default_handler:
+ # This library has already configured the library root logger.
+ return
+ _default_handler = logging.StreamHandler() # Set sys.stderr as stream.
+ # set defaults based on https://github.com/pyinstaller/pyinstaller/issues/7334#issuecomment-1357447176
+ if sys.stderr is None:
+ sys.stderr = open(os.devnull, "w")
+
+ _default_handler.flush = sys.stderr.flush
+
+ # Apply our default configuration to the library root logger.
+ library_root_logger = _get_library_root_logger()
+ library_root_logger.addHandler(_default_handler)
+ library_root_logger.setLevel(_get_default_logging_level())
+ # if logging level is debug, we add pathname and lineno to formatter for easy debugging
+ if os.getenv("TRANSFORMERS_VERBOSITY", None) == "detail":
+ formatter = logging.Formatter("[%(levelname)s|%(pathname)s:%(lineno)s] %(asctime)s >> %(message)s")
+ _default_handler.setFormatter(formatter)
+
+ library_root_logger.propagate = False
+
+
+def _reset_library_root_logger() -> None:
+ """
+ Resets the root logger of the library to its default state.
+
+ Args:
+ None
+
+ Returns:
+ None. The function does not return any value.
+
+ Raises:
+ None
+ """
+ global _default_handler
+
+ with _lock:
+ if not _default_handler:
+ return
+
+ library_root_logger = _get_library_root_logger()
+ library_root_logger.removeHandler(_default_handler)
+ library_root_logger.setLevel(logging.NOTSET)
+ _default_handler = None
+
+
+def get_log_levels_dict():
+ """
+ Returns a dictionary of log levels.
+
+ Returns:
+ dict: A dictionary containing log levels and their corresponding values.
+ """
+ return log_levels
+
+
+def captureWarnings(capture):
+ """
+ Calls the `captureWarnings` method from the logging library to enable management of the warnings emitted by the
+ `warnings` library.
+
+ Read more about this method here:
+ https://docs.python.org/3/library/logging.html#integration-with-the-warnings-module
+
+ All warnings will be logged through the `py.warnings` logger.
+
+ Careful: this method also adds a handler to this logger if it does not already have one, and updates the
+ logging level of that logger to match the library's root logger.
+ """
+ logger = get_logger("py.warnings")
+
+ if not logger.handlers:
+ logger.addHandler(_default_handler)
+
+ logger.setLevel(_get_library_root_logger().level)
+
+ _captureWarnings(capture)
+
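+# For example, after `captureWarnings(True)`, a call to `warnings.warn(...)` is routed through
+# the "py.warnings" logger configured above instead of being printed directly to stderr.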
+
+def get_logger(name: Optional[str] = None) -> logging.Logger:
+ """
+ Return a logger with the specified name.
+
+ This function is not supposed to be directly accessed unless you are writing a custom transformers module.
+ """
+ if name is None:
+ name = _get_library_name()
+
+ _configure_library_root_logger()
+ return logging.getLogger(name)
+
+
+def get_verbosity() -> int:
+ """
+ Return the current level for the 🤗 Transformers's root logger as an int.
+
+ Returns:
+ `int`: The logging level.
+
+ 🤗 Transformers has the following logging levels:
+
+ - 50: `transformers.logging.CRITICAL` or `transformers.logging.FATAL`
+ - 40: `transformers.logging.ERROR`
+ - 30: `transformers.logging.WARNING` or `transformers.logging.WARN`
+ - 20: `transformers.logging.INFO`
+ - 10: `transformers.logging.DEBUG`
+
+ """
+ _configure_library_root_logger()
+ return _get_library_root_logger().getEffectiveLevel()
+
+
+def set_verbosity(verbosity: int) -> None:
+ """
+ Set the verbosity level for the 🤗 Transformers's root logger.
+
+ Args:
+ verbosity (`int`):
+ Logging level, e.g., one of:
+
+ - `transformers.logging.CRITICAL` or `transformers.logging.FATAL`
+ - `transformers.logging.ERROR`
+ - `transformers.logging.WARNING` or `transformers.logging.WARN`
+ - `transformers.logging.INFO`
+ - `transformers.logging.DEBUG`
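+
+ Example:
+
+ ```python
+ >>> from mindnlp.utils import logging
+ >>> logging.set_verbosity(logging.DEBUG)  # same effect as logging.set_verbosity_debug()
+ ```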
+ """
+ _configure_library_root_logger()
+ _get_library_root_logger().setLevel(verbosity)
+
+
+def set_verbosity_info():
+ """Set the verbosity to the `INFO` level."""
+ return set_verbosity(INFO)
+
+
+def set_verbosity_warning():
+ """Set the verbosity to the `WARNING` level."""
+ return set_verbosity(WARNING)
+
+
+def set_verbosity_debug():
+ """Set the verbosity to the `DEBUG` level."""
+ return set_verbosity(DEBUG)
+
+
+def set_verbosity_error():
+ """Set the verbosity to the `ERROR` level."""
+ return set_verbosity(ERROR)
+
+
+def disable_default_handler() -> None:
+ """Disable the default handler of the HuggingFace Transformers's root logger."""
+ _configure_library_root_logger()
+
+ assert _default_handler is not None
+ _get_library_root_logger().removeHandler(_default_handler)
+
+
+def enable_default_handler() -> None:
+ """Enable the default handler of the HuggingFace Transformers's root logger."""
+ _configure_library_root_logger()
+
+ assert _default_handler is not None
+ _get_library_root_logger().addHandler(_default_handler)
+
+
+def add_handler(handler: logging.Handler) -> None:
+ """adds a handler to the HuggingFace Transformers's root logger."""
+ _configure_library_root_logger()
+
+ assert handler is not None
+ _get_library_root_logger().addHandler(handler)
+
+
+def remove_handler(handler: logging.Handler) -> None:
+ """removes given handler from the HuggingFace Transformers's root logger."""
+ _configure_library_root_logger()
+
+ assert handler is not None and handler in _get_library_root_logger().handlers
+ _get_library_root_logger().removeHandler(handler)
+
+
+def disable_propagation() -> None:
+ """
+ Disable propagation of the library log outputs. Note that log propagation is disabled by default.
+ """
+ _configure_library_root_logger()
+ _get_library_root_logger().propagate = False
+
+
+def enable_propagation() -> None:
+ """
+ Enable propagation of the library log outputs. Please disable the HuggingFace Transformers's default handler to
+ prevent double logging if the root logger has been configured.
+ """
+ _configure_library_root_logger()
+ _get_library_root_logger().propagate = True
+
+
+def enable_explicit_format() -> None:
+ """
+ Enable explicit formatting for every HuggingFace Transformers's logger. The explicit formatter is as follows:
+ ```
+ [LEVELNAME|FILENAME|LINE NUMBER] TIME >> MESSAGE
+ ```
+ All handlers currently bound to the root logger are affected by this method.
+ """
+ handlers = _get_library_root_logger().handlers
+
+ for handler in handlers:
+ formatter = logging.Formatter("[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s")
+ handler.setFormatter(formatter)
+
+
+def reset_format() -> None:
+ """
+ Resets the formatting for HuggingFace Transformers's loggers.
+
+ All handlers currently bound to the root logger are affected by this method.
+ """
+ handlers = _get_library_root_logger().handlers
+
+ for handler in handlers:
+ handler.setFormatter(None)
+
+
+def warning_advice(self, *args, **kwargs):
+ """
+ This method is identical to `logger.warning()`, but if env var TRANSFORMERS_NO_ADVISORY_WARNINGS=1 is set, this
+ warning will not be printed
+ """
+ no_advisory_warnings = os.getenv("NO_ADVISORY_WARNINGS", False) # pylint: disable=invalid-envvar-default
+ if no_advisory_warnings:
+ return
+ self.warning(*args, **kwargs)
+
+
+logging.Logger.warning_advice = warning_advice
+
+
+@functools.lru_cache(None)
+def warning_once(self, *args, **kwargs):
+ """
+ This method is identical to `logger.warning()`, but will emit the warning with the same message only once
+
+ Note: The cache is for the function arguments, so 2 different callers using the same arguments will hit the cache.
+ The assumption here is that all warning messages are unique across the code. If they aren't, we would need to
+ switch to another type of cache that includes the caller frame information in the hashing function.
+ """
+ self.warning(*args, **kwargs)
+
+
+logging.Logger.warning_once = warning_once
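+# Minimal usage sketch: because `warning_once` is wrapped in `functools.lru_cache`, the cache key
+# is (logger, *args), so an identical message logged repeatedly is emitted only once per process:
+#
+#     logger = get_logger(__name__)
+#     for _ in range(3):
+#         logger.warning_once("resuming from checkpoint")  # printed a single time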
+
+
+class EmptyTqdm:
+ """Dummy tqdm which doesn't do anything."""
+ def __init__(self, *args, **kwargs):
+ """
+ Initializes an instance of the EmptyTqdm class.
+
+ Args:
+ self: The instance of the EmptyTqdm class.
+
+ Returns:
+ None. This method does not return any value.
+
+ Raises:
+ None.
+ """
+ self._iterator = args[0] if args else None
+
+ def __iter__(self):
+ """
+ This method implements the iterator protocol for the EmptyTqdm class.
+
+ Args:
+ self: EmptyTqdm object. The instance of the EmptyTqdm class for which the iterator is being created.
+
+ Returns:
+ None. This method returns an iterator object that iterates over the _iterator attribute of the EmptyTqdm instance.
+
+ Raises:
+ No specific exceptions are raised by this method.
+ """
+ return iter(self._iterator)
+
+ def __getattr__(self, _):
+ """Return empty function."""
+ def empty_fn(*args, **kwargs):
+ return
+ return empty_fn
+
+ def __enter__(self):
+ """
+ __enter__
+
+ Args:
+ self: EmptyTqdm
+ The self parameter refers to the current instance of the EmptyTqdm class.
+
+ Returns:
+ None
+ This method returns None.
+
+ Raises:
+ No exceptions are raised by this method.
+ """
+ return self
+
+ def __exit__(self, type_, value, traceback):
+ """
+ __exit__ method in the EmptyTqdm class.
+
+ Args:
+ self: EmptyTqdm object
+ The instance of the EmptyTqdm class.
+ type_: type
+ The type of the exception. It represents the type of the exception being handled.
+ value: exception
+ The exception that was raised. It represents the actual exception object.
+ traceback: traceback
+ The traceback object. It represents the traceback information associated with the exception.
+
+ Returns:
+ None
+ This method does not return any value.
+
+ Raises:
+ This method does not raise any exceptions explicitly.
+ """
+ return
+
+
+class _tqdm_cls:
+ """Callable factory that mimics `tqdm`.
+
+ When `_tqdm_active` is True, calls are forwarded to `tqdm_lib.tqdm`; otherwise a no-op
+ `EmptyTqdm` instance is returned. `set_lock` and `get_lock` proxy tqdm's class-level
+ lock, which is used for thread-safe output of multiple bars."""
+ def __call__(self, *args, **kwargs):
+ """
+ This method __call__ in the class _tqdm_cls is used to conditionally return either a tqdm object or an EmptyTqdm object based on the _tqdm_active flag.
+
+ Args:
+ self (object): The instance of the _tqdm_cls class. It is used to access the attributes and methods of the class.
+
+ Returns:
+ None: This method does not explicitly return any value. It returns either a tqdm object or an EmptyTqdm object based on the _tqdm_active flag.
+
+ Raises:
+ No specific exceptions are raised by this method under normal circumstances. However, if there are issues related to the instantiation of tqdm objects or EmptyTqdm objects, standard Python
+exceptions may be raised.
+ """
+ if _tqdm_active:
+ return tqdm_lib.tqdm(*args, **kwargs)
+ return EmptyTqdm(*args, **kwargs)
+
+ def set_lock(self, *args, **kwargs):
+ """
+ Method to set the lock for the _tqdm_cls instance.
+
+ Args:
+ self (_tqdm_cls): The instance of the _tqdm_cls class.
+ This parameter is required to access the instance and set the lock.
+ It is of type _tqdm_cls and represents the instance on which the lock is being set.
+
+ Returns:
+ None: This method does not return any value. The lock is set within the instance itself.
+
+ Raises:
+ No specific exceptions are raised by this method.
+ However, if _tqdm_active is False, the method will not set the lock and will return without any further action.
+ """
+ self._lock = None
+ if _tqdm_active:
+ return tqdm_lib.tqdm.set_lock(*args, **kwargs)
+
+ def get_lock(self):
+ """
+ This method is used to retrieve the lock used by the _tqdm_cls class.
+
+ Args:
+ self (object): The instance of the _tqdm_cls class.
+
+ Returns:
+ None: This method does not return any value.
+
+ Raises:
+ N/A
+ """
+ if _tqdm_active:
+ return tqdm_lib.tqdm.get_lock()
+
+
+tqdm = _tqdm_cls()
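+# Usage sketch: this module-level `tqdm` is a drop-in replacement for `tqdm.auto.tqdm`
+# that honours enable_progress_bar()/disable_progress_bar():
+#
+#     from mindnlp.utils.logging import tqdm, disable_progress_bar
+#     disable_progress_bar()
+#     for _ in tqdm(range(10)):  # iterates silently via EmptyTqdm
+#         pass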
+
+
+def is_progress_bar_enabled() -> bool:
+ """Return a boolean indicating whether tqdm progress bars are enabled."""
+ global _tqdm_active # pylint: disable=global-variable-not-assigned
+ return bool(_tqdm_active)
+
+
+def enable_progress_bar():
+ """Enable tqdm progress bar."""
+ global _tqdm_active
+ _tqdm_active = True
+
+
+def disable_progress_bar():
+ """Disable tqdm progress bar."""
+ global _tqdm_active
+ _tqdm_active = False
\ No newline at end of file
diff --git a/mindnlp/utils/safetensors_patch.py b/mindnlp/utils/safetensors_patch.py
index 740b7ba5c..2d32c56e5 100644
--- a/mindnlp/utils/safetensors_patch.py
+++ b/mindnlp/utils/safetensors_patch.py
@@ -6,6 +6,7 @@
from mindspore import Tensor
from mindnlp.core.configs import SUPPORT_BF16
+import safetensors
if SUPPORT_BF16:
from mindspore.common.np_dtype import bfloat16 # pylint: disable=import-error
@@ -190,7 +192,9 @@ def __exit__(self, *args):
self.file.close()
def metadata(self):
- return self.__metadata__
+ # Copy so the parsed header metadata is not mutated in place.
+ meta = dict(self.__metadata__ or {})
+ meta['format'] = 'pt'
+ return meta
def keys(self):
return list(self.tensors.keys())
@@ -201,6 +205,27 @@ def get_tensor(self, name):
def get_slice(self, name):
return self.tensors[name]
+def safe_save_file(tensor_dict, filename, metadata=None):
+ """
+ Function to safely save a dictionary of tensors to a file.
+
+ Args:
+ tensor_dict (dict): A dictionary where keys are strings and values are numpy arrays representing tensors.
+ filename (str): The name of the file where the tensor data will be saved.
+ metadata (optional): Additional metadata to be saved along with the tensor data. Default is None.
+
+ Returns:
+ None. The function does not return any value explicitly.
+
+ Raises:
+ ValueError: If the input tensor_dict is not in the expected format.
+ IOError: If there are issues with writing the data to the specified file.
+ Exception: Any other unexpected error that may occur during the process.
+ """
+ tensor_dict = {k: v.asnumpy() for k, v in tensor_dict.items()}
+ return safetensors.numpy.save_file(tensor_dict, filename, metadata)
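+# Usage sketch (tensor names and values are illustrative):
+#
+#     import mindspore
+#     from mindspore import ops
+#     safe_save_file({"weight": ops.ones((2, 2), mindspore.float32)},
+#                    "model.safetensors", metadata={"format": "pt"})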
+
def setup_safetensors_patch():
- import safetensors
safetensors.safe_open = fast_safe_open
+ # Patch safetensors.torch.save_file (not the torch proxy) to route through MindSpore tensors.
+ from safetensors import torch as safetensors_torch
+ safetensors_torch.save_file = safe_save_file
diff --git a/mindnlp/utils/testing_utils.py b/mindnlp/utils/testing_utils.py
index d819c9a1a..4a9167541 100644
--- a/mindnlp/utils/testing_utils.py
+++ b/mindnlp/utils/testing_utils.py
@@ -47,7 +47,6 @@
from transformers.utils.import_utils import (
is_pytest_available,
- is_mindspore_available,
is_essentia_available,
is_librosa_available,
is_pretty_midi_available,
@@ -55,7 +54,6 @@
is_pyctcdecode_available,
is_safetensors_available,
is_sentencepiece_available,
- is_soundfile_availble,
is_tokenizers_available,
is_pytesseract_available,
is_vision_available,
@@ -65,6 +63,10 @@
is_ftfy_available
)
from transformers.utils.generic import strtobool
+from .import_utils import (
+ is_mindspore_available,
+ is_soundfile_availble,
+)
if is_pytest_available():
from _pytest.doctest import (
diff --git a/mindnlp/utils/torch_proxy.py b/mindnlp/utils/torch_proxy.py
index 22e0c410a..7309b4d62 100644
--- a/mindnlp/utils/torch_proxy.py
+++ b/mindnlp/utils/torch_proxy.py
@@ -57,7 +57,7 @@ def initialize_torch_proxy():
sys.modules["torch"] = torch_proxy
# Set the required metadata
- torch_proxy.__version__ = "1.13.1+mindnlp"
+ torch_proxy.__version__ = "2.1.1"
return torch_proxy
@@ -71,9 +71,9 @@ def setup_metadata_patch():
def patched_distribution(dist_name):
if dist_name == "torch":
return types.SimpleNamespace(
- version="1.13.1+mindnlp",
- metadata={"Name": "torch", "Version": "1.13.1+mindnlp"},
- read_text=lambda f: f"Name: torch\nVersion: 1.13.1+mindnlp" if f == "METADATA" else None
+ version="2.1.1",
+ metadata={"Name": "torch", "Version": "2.1.1"},
+ read_text=lambda f: "Name: torch\nVersion: 2.1.1" if f == "METADATA" else None
)
return orig_distribution(dist_name)
@@ -82,10 +82,12 @@ def patched_distributions(**kwargs):
dists = list(orig_distributions(**kwargs))
dists.append(types.SimpleNamespace(
name="torch",
- version="1.13.1+mindnlp",
- metadata={"Name": "torch", "Version": "1.13.1+mindnlp"},
+ version="2.1.1",
+ metadata={"Name": "torch", "Version": "2.1.1"},
files=[],
- locate_file=lambda p: None
+ locate_file=lambda p: None,
+ _normalized_name='torch',
+ entry_points=[]
))
return dists
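+# Note: reporting a plain "2.1.1" (rather than "1.13.1+mindnlp") is presumably intended to let
+# importlib.metadata-based version checks in downstream libraries parse and pass cleanly.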
diff --git a/tests/transformers/models/bert/test_modeling_bert.py b/tests/transformers/models/bert/test_modeling_bert.py
index 1eb1876dc..b93b9c108 100644
--- a/tests/transformers/models/bert/test_modeling_bert.py
+++ b/tests/transformers/models/bert/test_modeling_bert.py
@@ -49,6 +49,7 @@
logging,
)
+mindspore.set_context(pynative_synchronize=True)
class BertModelTester:
def __init__(
self,
@@ -660,7 +661,7 @@ def test_sdpa_ignored_mask(self):
pkv.append([ops.rand(1, num_heads, 3, head_dim), ops.rand(1, num_heads, 3, head_dim)])
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertModel")
- inp = tokenizer("I am in Paris and", return_tensors="ms")
+ inp = tokenizer("I am in Paris and", return_tensors="pt")
del inp["attention_mask"]
diff --git a/tests/transformers/test_modeling_common.py b/tests/transformers/test_modeling_common.py
index 5b094368d..504e41d1e 100644
--- a/tests/transformers/test_modeling_common.py
+++ b/tests/transformers/test_modeling_common.py
@@ -81,9 +81,10 @@
require_mindspore,
slow,
)
-from mindnlp.configs import CONFIG_NAME, GENERATION_CONFIG_NAME, SAFE_WEIGHTS_NAME, ON_ORANGE_PI
+from mindnlp.transformers.utils import CONFIG_NAME, GENERATION_CONFIG_NAME, SAFE_WEIGHTS_NAME
+from mindnlp.core.configs import ON_ORANGE_PI
from mindnlp.utils.generic import ContextManagers, ModelOutput
-
+from transformers.pytorch_utils import id_tensor_storage
# if is_accelerate_available():
# from accelerate.utils import compute_module_sizes
@@ -1690,8 +1691,8 @@ def test_tied_weights_keys(self):
)
def test_model_weights_reload_no_missing_tied_weights(self):
- config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config)
with tempfile.TemporaryDirectory() as tmp_dir:
model.save_pretrained(tmp_dir)
@@ -1699,13 +1700,12 @@ def test_model_weights_reload_no_missing_tied_weights(self):
# We are nuking ALL weights on file, so every parameter should
# yell on load. We're going to detect if we yell too much, or too little.
placeholder_dict = {"tensor": mindspore.tensor([1, 2])}
- safe_save_file(placeholder_dict, os.path.join(tmp_dir, "model.safetensors"), metadata={"format": "np"})
+ safe_save_file(placeholder_dict, os.path.join(tmp_dir, "model.safetensors"), metadata={"format": "pt"})
model_reloaded, infos = model_class.from_pretrained(tmp_dir, output_loading_info=True)
- prefix = f"{model_reloaded.base_model_prefix}."
params = dict(model_reloaded.named_parameters())
params.update(dict(model_reloaded.named_buffers()))
- param_names = {k[len(prefix) :] if k.startswith(prefix) else k for k in params.keys()}
+ param_names = set(params.keys())
missing_keys = set(infos["missing_keys"])
@@ -1714,12 +1714,11 @@ def test_model_weights_reload_no_missing_tied_weights(self):
# counterpart is present but here there are no weights at all so we do get the warning.
ptrs = collections.defaultdict(list)
for name, tensor in model_reloaded.state_dict().items():
- ptrs[id(tensor)].append(name)
+ ptrs[id_tensor_storage(tensor)].append(name)
tied_params = [names for _, names in ptrs.items() if len(names) > 1]
for group in tied_params:
- group = {k[len(prefix) :] if k.startswith(prefix) else k for k in group}
# We remove the group from extra_missing if not all weights from group are in it
- if len(group - extra_missing) > 0:
+ if len(set(group) - extra_missing) > 0:
extra_missing = extra_missing - set(group)
self.assertEqual(
@@ -1733,15 +1732,14 @@ def test_model_weights_reload_no_missing_tied_weights(self):
# Remove nonpersistent buffers from missed_missing
buffers = [n for n, _ in model_reloaded.named_buffers()]
nonpersistent_buffers = {n for n in buffers if n not in model_reloaded.state_dict()}
- nonpersistent_buffers = {
- k[len(prefix) :] if k.startswith(prefix) else k for k in nonpersistent_buffers
- }
missed_missing = missed_missing - nonpersistent_buffers
if model_reloaded._keys_to_ignore_on_load_missing is None:
expected_missing = set()
else:
- expected_missing = set(model_reloaded._keys_to_ignore_on_load_missing)
+ expected_missing = set()
+ for pattern in model_reloaded._keys_to_ignore_on_load_missing:
+ expected_missing.update({k for k in param_names if re.search(pattern, k) is not None})
self.assertEqual(
missed_missing,
expected_missing,