11,198 changes: 10,693 additions & 505 deletions examples/transformers/peft/lora/Qwen2.5-7B-Instruct-Lora.ipynb

Large diffs are not rendered by default.

23 changes: 3 additions & 20 deletions mindnlp/core/__init__.py
@@ -50,7 +50,9 @@
from .autograd import *
from .ops import *
from .serialization import load, save
from ._bind import get_default_dtype, set_default_dtype, get_default_device
from ._bind import get_default_dtype, set_default_dtype, get_default_device, is_autocast_enabled, set_autocast_enabled, \
set_autocast_dtype, get_autocast_dtype

from .amp import autocast, GradScaler
from .func import vmap
from .configs import set_pyboost
@@ -90,25 +92,6 @@ def wrap_func(fn):
return wrap_func(fn)
return wrap_func

AUTO_CAST_DTYE = {
'cuda': float16,
'cpu': float16,
'npu': float16,
'Ascend': float16
}

def set_autocast_dtype(device_type, dtype):
assert device_type in AUTO_CAST_DTYE.keys(), f'{device_type} is not in {AUTO_CAST_DTYE.keys()}'
AUTO_CAST_DTYE[device_type] = dtype

def get_autocast_dtype(device_type):
return AUTO_CAST_DTYE[device_type]

def get_autocast_gpu_dtype():
return AUTO_CAST_DTYE['cuda']

def is_autocast_enabled():
return True

def use_deterministic_algorithms(mode, *, warn_only=False):
mindspore.set_context(deterministic='ON' if mode else 'OFF')
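With this hunk the autocast helpers are defined only in `_bind` and re-exported from `mindnlp.core`, so callers keep the package-level import path. A minimal usage sketch, assuming the re-exports resolve as shown in the import above and that `float16` is exposed on `mindnlp.core` as in the removed code:

```python
from mindnlp import core

core.set_autocast_dtype('Ascend', core.float16)   # cast dtype is tracked per device type
assert core.get_autocast_dtype('Ascend') == core.float16
core.set_autocast_enabled('Ascend', True)         # the relocated helper now takes a device argument
assert core.is_autocast_enabled('Ascend')
```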
17 changes: 15 additions & 2 deletions mindnlp/core/_bind.py
@@ -10,17 +10,30 @@

AUTO_CAST_DTYE = {
'cuda': float16,
'cpu': bfloat16,
'npu': float16
'cpu': float16,
'npu': float16,
'Ascend': float16
}

AUTO_CAST_ENABLED = False

def set_autocast_enabled(device, mode):
global AUTO_CAST_ENABLED
AUTO_CAST_ENABLED = mode

def set_autocast_dtype(device_type, dtype):
assert device_type in AUTO_CAST_DTYE.keys(), f'{device_type} is not in {AUTO_CAST_DTYE.keys()}'
AUTO_CAST_DTYE[device_type] = dtype

def get_autocast_dtype(device_type):
return AUTO_CAST_DTYE[device_type]

def get_autocast_gpu_dtype():
return AUTO_CAST_DTYE['cuda']

def is_autocast_enabled(device):
return AUTO_CAST_ENABLED

def set_default_dtype(dtype):
"""set default dtype"""
global DEFAULT_DTYPE
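Worth noting: `AUTO_CAST_ENABLED` is a single module-level flag, so the `device` argument accepted by `set_autocast_enabled` and `is_autocast_enabled` is currently ignored, while the cast dtype really is tracked per device type. A small sketch of that behavior, assuming the module is importable as `mindnlp.core._bind`:

```python
from mindspore import bfloat16
from mindnlp.core import _bind

_bind.set_autocast_enabled('cuda', True)
print(_bind.is_autocast_enabled('cpu'))     # True: the enabled flag is global, not per device

_bind.set_autocast_dtype('cpu', bfloat16)   # dtypes, by contrast, are stored per device type
print(_bind.get_autocast_dtype('cpu'))      # BFloat16
print(_bind.get_autocast_dtype('cuda'))     # Float16 (unchanged)
```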
45 changes: 45 additions & 0 deletions mindnlp/core/_prims/ascend.py
@@ -1,9 +1,11 @@
import numbers
import mindspore
from mindspore import ops
from mindspore.ops.auto_generate import gen_ops_prim
from mindspore.ops.auto_generate import pyboost_inner_prim
from mindspore._c_expression import _empty_instance
from mindspore.ops.operations.math_ops import NPUGetFloatStatusV2, NPUClearFloatStatusV2
from mindspore.ops.operations.nn_ops import AllFinite

from mindnlp import core
from mindnlp.core._C import default_generator
@@ -163,6 +165,8 @@ def reverse_v2(input, dims):
dims = (dims,)
return pyboost_inner_prim.reverse_v2_impl(input, dims)

__all__.append('reverse_v2')

adam_op = ops.Adam().set_device('Ascend')
def raw_adam(param, exp_avg, exp_avg_sq, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad):
# var, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad
@@ -193,3 +197,44 @@ def stop_gradient(*args):
return stop_gradient_op(*args)

__all__.append('stop_gradient')

# allfinite_op = AllFinite().set_device('Ascend')
def all_finite(inputs):
# NOTE: allfinite_op is commented out above, so calling this raises NameError as written.
return allfinite_op(inputs)

def rsqrt_fp32(input):
return rsqrt(input.astype(mindspore.float32))

__all__.append('rsqrt_fp32')

def matmul_ext_fp16(input, other):
return matmul_ext(input.astype(mindspore.float16), other.astype(mindspore.float16))

__all__.append('matmul_ext_fp16')

def dense_fp16(input, weight, bias):
input = input.astype(mindspore.float16)
weight = weight.astype(mindspore.float16)
if bias is not None:
bias = bias.astype(mindspore.float16)
return dense(input, weight, bias)

__all__.append('dense_fp16')

def softmax_fp32(input, dim):
return softmax(input.astype(mindspore.float32), dim)

__all__.append('softmax_fp32')

def log_softmax_ext_fp32(input, dim, dtype):
return log_softmax_ext(input.astype(mindspore.float32), dim, dtype)

__all__.append('log_softmax_ext_fp32')

def one_hot_ext(tensor, num_classes):
on_value = core.Tensor(1, dtype=tensor.dtype)
off_value = core.Tensor(0, dtype=tensor.dtype)

return pyboost_inner_prim.one_hot_ext_impl(tensor, num_classes, on_value, off_value, -1)

__all__.append('one_hot_ext')
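The new `*_fp16` / `*_fp32` prims all follow the same shape: cast the tensor arguments to a fixed dtype, then dispatch to the existing op. A generic, hedged sketch of that pattern (the wrapper below is illustrative, not a mindnlp prim):

```python
import mindspore

def cast_then_dispatch(op, dtype):
    """Illustrative helper: cast tensor args to `dtype` before calling `op`,
    mirroring the dense_fp16 / softmax_fp32 style wrappers added above."""
    def wrapped(*args):
        cast_args = [a.astype(dtype) if isinstance(a, mindspore.Tensor) else a
                     for a in args]
        return op(*cast_args)
    return wrapped

# e.g. a float16 matmul analogous to matmul_ext_fp16
matmul_fp16 = cast_then_dispatch(mindspore.ops.matmul, mindspore.float16)
```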
6 changes: 3 additions & 3 deletions mindnlp/core/_tensor.py
@@ -338,8 +338,8 @@ def __bool__(self):
return bool(self.item())

def __index__(self):
if self.ndim > 0:
return self.tolist()
# if self.ndim > 0:
# return self.tolist()
return int(self.item())

def __and__(self, other):
@@ -1216,7 +1216,7 @@ def hardshrink(self, lambd=0.5):

# Tensor.index_add_
def index_add_(self, dim, index, source, *, alpha=1):
return self.copy_(ops.index_add(self, dim, source, alpha=alpha))
return self.copy_(ops.index_add(self, dim, index, source, alpha=alpha))

# Tensor.index_add
def index_add(self, dim, index, source, *, alpha=1):
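The `index_add_` fix forwards the previously dropped `index` argument to `ops.index_add`. A hedged usage sketch of the corrected signature (torch-style semantics; the tensor constructors are assumed to mirror torch):

```python
from mindnlp import core

x = core.zeros(3, 4)
index = core.tensor([0, 2])
src = core.ones(2, 4)
x.index_add_(0, index, src, alpha=2)   # rows 0 and 2 of x each receive 2 * the matching row of src
```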
3 changes: 3 additions & 0 deletions mindnlp/core/amp/autocast_mode.py
@@ -10,6 +10,7 @@
from mindspore.common.dtype import TensorType as _dtype, float32
from mindspore.train.amp import AMP_AUTO_BLACK_LIST, AMP_AUTO_WHITE_LIST, AMP_PRIM_ARG_TABLE


try:
import numpy as np

@@ -48,6 +49,7 @@ def decorate_autocast(*args, **kwargs):
return decorate_autocast



class autocast:

def __init__(
@@ -82,6 +84,7 @@ def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any): # type: ignore[ov
def __call__(self, func):
return autocast_decorator(self, func)


def _cast(value, device_type: str, dtype):
if isinstance(value, core.Tensor):
is_eligible = (
17 changes: 16 additions & 1 deletion mindnlp/core/amp/grad_scaler.py
@@ -21,9 +21,14 @@ def non_finite_check(inputs):
status = core.tensor(np.array([0] * 8), dtype=core.int32, device='npu')
status = core.depend(status, inputs)
found_inf = core.npu_get_float_status_v2(status)
print('found_inf', found_inf)
status = core.depend(status, found_inf)
clear_status = core.npu_clear_float_status_v2(status)
found_inf = core.depend(found_inf, clear_status)
print('found_inf', found_inf)
found_inf = core.not_equal(found_inf, 0)
print('found_inf', found_inf)
print('after clear', core.npu_get_float_status_v2(status))
return found_inf.sum()

found_inf = core.all_finite(inputs) # pylint: disable=invalid-unary-operand-type
@@ -302,7 +307,9 @@ def _unscale_grads_(
# per_device_inv_scale.get(device),
# )
found_inf = per_device_found_inf.get(device)
print('found_inf before', found_inf)
found_inf.copy_(non_finite_check(grads).to(found_inf.dtype))
print('found_inf after', found_inf)
for grad in grads:
grad *= per_device_inv_scale.get(device)

@@ -372,6 +379,7 @@ def _maybe_opt_step(
**kwargs: Any,
) -> Optional[float]:
retval: Optional[float] = None
print(sum(v.item() for v in optimizer_state["found_inf_per_device"].values()))
if not sum(v.item() for v in optimizer_state["found_inf_per_device"].values()):
retval = optimizer.step(*args, **kwargs)
return retval
@@ -418,7 +426,8 @@ def step(
)

retval: Optional[float] = None

print('grad scaler')
print(getattr(optimizer, "_step_supports_amp_scaling", False))
if getattr(optimizer, "_step_supports_amp_scaling", False):
# This optimizer has customized scale-handling logic, so we can call optimizer.step() directly.
# The contract with custom optimizers is that their step() should accept an additional,
@@ -478,6 +487,7 @@ def step(
len(optimizer_state["found_inf_per_device"]) > 0
), "No inf checks were recorded for this optimizer."

print('_maybe_opt_step')
retval = self._maybe_opt_step(optimizer, optimizer_state, *args, **kwargs)

optimizer_state["stage"] = OptState.STEPPED
@@ -514,6 +524,8 @@ def update(self, new_scale: Optional[Union[float, core.Tensor]] = None) -> None:

_scale, _growth_tracker = self._check_scale_growth_tracker("update")

print('scaler update')
print('scaler update', new_scale is not None)
if new_scale is not None:
assert self._scale is not None
# Accept a new user-defined scale.
@@ -551,7 +563,10 @@ def update(self, new_scale: Optional[Union[float, core.Tensor]] = None) -> None:
# self._growth_interval,
# )
if found_inf_combined > 0:
print(_scale)
_scale.copy_(_scale * self._backoff_factor)
print(_scale)

_growth_tracker.copy_(_growth_tracker * 0)
else:
successful = self._growth_interval + 1
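For context, the `GradScaler` touched here follows the `torch.amp.GradScaler` contract: `scale` multiplies the loss, `step` unscales gradients and skips the optimizer step when `non_finite_check` reports an overflow, and `update` backs off or grows the scale. A hedged sketch of that contract (how gradients are produced depends on mindnlp's autograd and is left as a placeholder):

```python
from mindnlp.core.amp import GradScaler

scaler = GradScaler()

def amp_step(optimizer, loss, backward_fn):
    scaled_loss = scaler.scale(loss)   # multiply the loss by the current scale factor
    backward_fn(scaled_loss)           # produce gradients for the scaled loss (placeholder)
    scaler.step(optimizer)             # unscale grads; skip the step if non-finite values were found
    scaler.update()                    # back off the scale on overflow, grow it after enough good steps
```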
2 changes: 2 additions & 0 deletions mindnlp/core/autograd/function.py
@@ -43,6 +43,7 @@ def value_and_grad_f(*args, **kwargs):
_pynative_executor.set_grad_flag(True)
_pynative_executor.new_graph(fn_, *args, **kwargs)
values = fn_(*args, **kwargs)
print('all_finite forward', mindspore.amp.all_finite(values))
_pynative_executor.end_graph(fn_, values, *args, **kwargs)

run_args = args
@@ -51,6 +52,7 @@

grads = _pynative_executor.check_run(grad_, fn_, params_or_argnums, None, *run_args)
grads = _pynative_executor.grad(fn_, grad_, params_or_argnums, None, *run_args)
# print('all_finite backward', mindspore.amp.all_finite(grads))
if attach_grads:
for param, grad in zip(params_or_argnums, grads):
grad = core.tensor(grad, device=param.device)
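The debug line added here relies on `mindspore.amp.all_finite`, which returns a boolean Tensor that is True only when every element of the given tensors is finite; useful for spotting fp16 overflow in outputs or gradients. A small standalone example:

```python
import numpy as np
import mindspore

grads = (mindspore.Tensor(np.array([1.0, 2.0], np.float16)),
         mindspore.Tensor(np.array([np.inf], np.float16)))
print(mindspore.amp.all_finite(grads))   # Tensor(False): the second tensor overflowed
```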