11,198 changes: 10,693 additions & 505 deletions examples/transformers/peft/lora/Qwen2.5-7B-Instruct-Lora.ipynb

Large diffs are not rendered by default.

23 changes: 3 additions & 20 deletions mindnlp/core/__init__.py
@@ -50,7 +50,9 @@
from .autograd import *
from .ops import *
from .serialization import load, save
from ._bind import get_default_dtype, set_default_dtype, get_default_device
from ._bind import get_default_dtype, set_default_dtype, get_default_device, is_autocast_enabled, set_autocast_enabled, \
set_autocast_dtype, get_autocast_dtype

from .amp import autocast, GradScaler
from .func import vmap
from .configs import set_pyboost
@@ -90,25 +92,6 @@ def wrap_func(fn):
return wrap_func(fn)
return wrap_func

AUTO_CAST_DTYE = {
'cuda': float16,
'cpu': float16,
'npu': float16,
'Ascend': float16
}

def set_autocast_dtype(device_type, dtype):
assert device_type in AUTO_CAST_DTYE.keys(), f'{device_type} is not in {AUTO_CAST_DTYE.keys()}'
AUTO_CAST_DTYE[device_type] = dtype

def get_autocast_dtype(device_type):
return AUTO_CAST_DTYE[device_type]

def get_autocast_gpu_dtype():
return AUTO_CAST_DTYE['cuda']

def is_autocast_enabled():
return True

def use_deterministic_algorithms(mode, *, warn_only=False):
mindspore.set_context(deterministic='ON' if mode else 'OFF')
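With this hunk the autocast helpers are defined only in `_bind` and re-exported from `mindnlp.core`, so callers keep the package-level import path. A minimal usage sketch, assuming the re-exports resolve as shown in the import above and that `float16` is exposed on `mindnlp.core` as in the removed code:

```python
from mindnlp import core

core.set_autocast_dtype('Ascend', core.float16)   # cast dtype is tracked per device type
assert core.get_autocast_dtype('Ascend') == core.float16
core.set_autocast_enabled('Ascend', True)         # the relocated helper now takes a device argument
assert core.is_autocast_enabled('Ascend')
```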
17 changes: 15 additions & 2 deletions mindnlp/core/_bind.py
@@ -10,17 +10,30 @@

AUTO_CAST_DTYE = {
'cuda': float16,
'cpu': bfloat16,
'npu': float16
'cpu': float16,
'npu': float16,
'Ascend': float16
}

AUTO_CAST_ENABLED = False

def set_autocast_enabled(device, mode):
global AUTO_CAST_ENABLED
AUTO_CAST_ENABLED = mode

def set_autocast_dtype(device_type, dtype):
assert device_type in AUTO_CAST_DTYE.keys(), f'{device_type} is not in {AUTO_CAST_DTYE.keys()}'
AUTO_CAST_DTYE[device_type] = dtype

def get_autocast_dtype(device_type):
return AUTO_CAST_DTYE[device_type]

def get_autocast_gpu_dtype():
return AUTO_CAST_DTYE['cuda']

def is_autocast_enabled(device):
return AUTO_CAST_ENABLED

def set_default_dtype(dtype):
"""set default dtype"""
global DEFAULT_DTYPE
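Worth noting: `AUTO_CAST_ENABLED` is a single module-level flag, so the `device` argument accepted by `set_autocast_enabled` and `is_autocast_enabled` is currently ignored, while the cast dtype really is tracked per device type. A small sketch of that behavior, assuming the module is importable as `mindnlp.core._bind`:

```python
from mindspore import bfloat16
from mindnlp.core import _bind

_bind.set_autocast_enabled('cuda', True)
print(_bind.is_autocast_enabled('cpu'))     # True: the enabled flag is global, not per device

_bind.set_autocast_dtype('cpu', bfloat16)   # dtypes, by contrast, are stored per device type
print(_bind.get_autocast_dtype('cpu'))      # BFloat16
print(_bind.get_autocast_dtype('cuda'))     # Float16 (unchanged)
```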
45 changes: 45 additions & 0 deletions mindnlp/core/_prims/ascend.py
@@ -1,9 +1,11 @@
import numbers
import mindspore
from mindspore import ops
from mindspore.ops.auto_generate import gen_ops_prim
from mindspore.ops.auto_generate import pyboost_inner_prim
from mindspore._c_expression import _empty_instance
from mindspore.ops.operations.math_ops import NPUGetFloatStatusV2, NPUClearFloatStatusV2
from mindspore.ops.operations.nn_ops import AllFinite

from mindnlp import core
from mindnlp.core._C import default_generator
@@ -163,6 +165,8 @@ def reverse_v2(input, dims):
dims = (dims,)
return pyboost_inner_prim.reverse_v2_impl(input, dims)

__all__.append('reverse_v2')

adam_op = ops.Adam().set_device('Ascend')
def raw_adam(param, exp_avg, exp_avg_sq, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad):
# var, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad
@@ -193,3 +197,44 @@ def stop_gradient(*args):
return stop_gradient_op(*args)

__all__.append('stop_gradient')

# allfinite_op = AllFinite().set_device('Ascend')
def all_finite(inputs):
# NOTE: allfinite_op is commented out above, so calling this raises NameError as written.
return allfinite_op(inputs)

def rsqrt_fp32(input):
return rsqrt(input.astype(mindspore.float32))

__all__.append('rsqrt_fp32')

def matmul_ext_fp16(input, other):
return matmul_ext(input.astype(mindspore.float16), other.astype(mindspore.float16))

__all__.append('matmul_ext_fp16')

def dense_fp16(input, weight, bias):
input = input.astype(mindspore.float16)
weight = weight.astype(mindspore.float16)
if bias is not None:
bias = bias.astype(mindspore.float16)
return dense(input, weight, bias)

__all__.append('dense_fp16')

def softmax_fp32(input, dim):
return softmax(input.astype(mindspore.float32), dim)

__all__.append('softmax_fp32')

def log_softmax_ext_fp32(input, dim, dtype):
return log_softmax_ext(input.astype(mindspore.float32), dim, dtype)

__all__.append('log_softmax_ext_fp32')

def one_hot_ext(tensor, num_classes):
on_value = core.Tensor(1, dtype=tensor.dtype)
off_value = core.Tensor(0, dtype=tensor.dtype)

return pyboost_inner_prim.one_hot_ext_impl(tensor, num_classes, on_value, off_value, -1)

__all__.append('one_hot_ext')
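The new `*_fp16` / `*_fp32` prims all follow the same shape: cast the tensor arguments to a fixed dtype, then dispatch to the existing op. A generic, hedged sketch of that pattern (the wrapper below is illustrative, not a mindnlp prim):

```python
import mindspore

def cast_then_dispatch(op, dtype):
    """Illustrative helper: cast tensor args to `dtype` before calling `op`,
    mirroring the dense_fp16 / softmax_fp32 style wrappers added above."""
    def wrapped(*args):
        cast_args = [a.astype(dtype) if isinstance(a, mindspore.Tensor) else a
                     for a in args]
        return op(*cast_args)
    return wrapped

# e.g. a float16 matmul analogous to matmul_ext_fp16
matmul_fp16 = cast_then_dispatch(mindspore.ops.matmul, mindspore.float16)
```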
6 changes: 3 additions & 3 deletions mindnlp/core/_tensor.py
@@ -338,8 +338,8 @@ def __bool__(self):
return bool(self.item())

def __index__(self):
if self.ndim > 0:
return self.tolist()
# if self.ndim > 0:
# return self.tolist()
return int(self.item())

def __and__(self, other):
@@ -1216,7 +1216,7 @@ def hardshrink(self, lambd=0.5):

# Tensor.index_add_
def index_add_(self, dim, index, source, *, alpha=1):
return self.copy_(ops.index_add(self, dim, source, alpha=alpha))
return self.copy_(ops.index_add(self, dim, index, source, alpha=alpha))

# Tensor.index_add
def index_add(self, dim, index, source, *, alpha=1):
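The `index_add_` fix forwards the previously dropped `index` argument to `ops.index_add`. A hedged usage sketch of the corrected signature (torch-style semantics; the tensor constructors are assumed to mirror torch):

```python
from mindnlp import core

x = core.zeros(3, 4)
index = core.tensor([0, 2])
src = core.ones(2, 4)
x.index_add_(0, index, src, alpha=2)   # rows 0 and 2 of x each receive 2 * the matching row of src
```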
3 changes: 3 additions & 0 deletions mindnlp/core/amp/autocast_mode.py
@@ -10,6 +10,7 @@
from mindspore.common.dtype import TensorType as _dtype, float32
from mindspore.train.amp import AMP_AUTO_BLACK_LIST, AMP_AUTO_WHITE_LIST, AMP_PRIM_ARG_TABLE


try:
import numpy as np

@@ -48,6 +49,7 @@ def decorate_autocast(*args, **kwargs):
return decorate_autocast



class autocast:

def __init__(
@@ -82,6 +84,7 @@ def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any): # type: ignore[ov
def __call__(self, func):
return autocast_decorator(self, func)


def _cast(value, device_type: str, dtype):
if isinstance(value, core.Tensor):
is_eligible = (
17 changes: 16 additions & 1 deletion mindnlp/core/amp/grad_scaler.py
@@ -21,9 +21,14 @@ def non_finite_check(inputs):
status = core.tensor(np.array([0] * 8), dtype=core.int32, device='npu')
status = core.depend(status, inputs)
found_inf = core.npu_get_float_status_v2(status)
print('found_inf', found_inf)
status = core.depend(status, found_inf)
clear_status = core.npu_clear_float_status_v2(status)
found_inf = core.depend(found_inf, clear_status)
print('found_inf', found_inf)
found_inf = core.not_equal(found_inf, 0)
print('found_inf', found_inf)
print('after clear', core.npu_get_float_status_v2(status))
return found_inf.sum()

found_inf = core.all_finite(inputs) # pylint: disable=invalid-unary-operand-type
@@ -302,7 +307,9 @@ def _unscale_grads_(
# per_device_inv_scale.get(device),
# )
found_inf = per_device_found_inf.get(device)
print('found_inf before', found_inf)
found_inf.copy_(non_finite_check(grads).to(found_inf.dtype))
print('found_inf after', found_inf)
for grad in grads:
grad *= per_device_inv_scale.get(device)

@@ -372,6 +379,7 @@ def _maybe_opt_step(
**kwargs: Any,
) -> Optional[float]:
retval: Optional[float] = None
print(sum(v.item() for v in optimizer_state["found_inf_per_device"].values()))
if not sum(v.item() for v in optimizer_state["found_inf_per_device"].values()):
retval = optimizer.step(*args, **kwargs)
return retval
@@ -418,7 +426,8 @@ def step(
)

retval: Optional[float] = None

print('grad scaler')
print(getattr(optimizer, "_step_supports_amp_scaling", False))
if getattr(optimizer, "_step_supports_amp_scaling", False):
# This optimizer has customized scale-handling logic, so we can call optimizer.step() directly.
# The contract with custom optimizers is that their step() should accept an additional,
@@ -478,6 +487,7 @@ def step(
len(optimizer_state["found_inf_per_device"]) > 0
), "No inf checks were recorded for this optimizer."

print('_maybe_opt_step')
retval = self._maybe_opt_step(optimizer, optimizer_state, *args, **kwargs)

optimizer_state["stage"] = OptState.STEPPED
@@ -514,6 +524,8 @@ def update(self, new_scale: Optional[Union[float, core.Tensor]] = None) -> None:

_scale, _growth_tracker = self._check_scale_growth_tracker("update")

print('scaler update')
print('scaler update', new_scale is not None)
if new_scale is not None:
assert self._scale is not None
# Accept a new user-defined scale.
@@ -551,7 +563,10 @@ def update(self, new_scale: Optional[Union[float, core.Tensor]] = None) -> None:
# self._growth_interval,
# )
if found_inf_combined > 0:
print(_scale)
_scale.copy_(_scale * self._backoff_factor)
print(_scale)

_growth_tracker.copy_(_growth_tracker * 0)
else:
successful = self._growth_interval + 1
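For context, the `GradScaler` touched here follows the `torch.amp.GradScaler` contract: `scale` multiplies the loss, `step` unscales gradients and skips the optimizer step when `non_finite_check` reports an overflow, and `update` backs off or grows the scale. A hedged sketch of that contract (how gradients are produced depends on mindnlp's autograd and is left as a placeholder):

```python
from mindnlp.core.amp import GradScaler

scaler = GradScaler()

def amp_step(optimizer, loss, backward_fn):
    scaled_loss = scaler.scale(loss)   # multiply the loss by the current scale factor
    backward_fn(scaled_loss)           # produce gradients for the scaled loss (placeholder)
    scaler.step(optimizer)             # unscale grads; skip the step if non-finite values were found
    scaler.update()                    # back off the scale on overflow, grow it after enough good steps
```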
2 changes: 2 additions & 0 deletions mindnlp/core/autograd/function.py
@@ -43,6 +43,7 @@ def value_and_grad_f(*args, **kwargs):
_pynative_executor.set_grad_flag(True)
_pynative_executor.new_graph(fn_, *args, **kwargs)
values = fn_(*args, **kwargs)
print('all_finite forward', mindspore.amp.all_finite(values))
_pynative_executor.end_graph(fn_, values, *args, **kwargs)

run_args = args
@@ -51,6 +52,7 @@

grads = _pynative_executor.check_run(grad_, fn_, params_or_argnums, None, *run_args)
grads = _pynative_executor.grad(fn_, grad_, params_or_argnums, None, *run_args)
# print('all_finite backward', mindspore.amp.all_finite(grads))
if attach_grads:
for param, grad in zip(params_or_argnums, grads):
grad = core.tensor(grad, device=param.device)
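The debug line added here relies on `mindspore.amp.all_finite`, which returns a boolean Tensor that is True only when every element of the given tensors is finite; useful for spotting fp16 overflow in outputs or gradients. A small standalone example:

```python
import numpy as np
import mindspore

grads = (mindspore.Tensor(np.array([1.0, 2.0], np.float16)),
         mindspore.Tensor(np.array([np.inf], np.float16)))
print(mindspore.amp.all_finite(grads))   # Tensor(False): the second tensor overflowed
```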