Skip to content

Commit

Permalink
support sharding in fp16 on xpu (PaddlePaddle#48897)
Browse files Browse the repository at this point in the history
* support sharding in fp16 on xpu; change reduce_max to reduce_sum when checking whether NaN or Inf was found

* update
  • Loading branch information
sljlp committed Dec 12, 2022
1 parent 28db99a commit 789e764
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,8 @@ def unscale_method(self, optimizer):
temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool_))
temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool_))

device = "cpu" if optimizer.offload else "gpu"
device = paddle.get_device().split(":")[0]
device = "cpu" if optimizer.offload else device
dev_id = (
0 if device == "cpu" else int(paddle.get_device().split(":")[1])
)
Expand All @@ -245,8 +246,9 @@ def unscale_method(self, optimizer):
is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")

paddle.distributed.all_reduce(
is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
is_found_inf, op=paddle.distributed.ReduceOp.SUM, group=None
)

self._found_inf = is_found_inf.numpy()[0]

scaler._unscale = MethodType(unscale_method, scaler)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@ def nan_inf(self):
scaled_loss = scaler.scale(loss)
scaled_loss.backward()
optimize_ops, params_grads = scaler.minimize(optimizer, scaled_loss)
self.assertEqual(scaler._found_inf.numpy() == 1, True)
self.assertEqual(scaler._found_inf.numpy() >= 1, True)

for param in model.parameters():
# param not update when tensor contains nan or inf
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ def nan_inf(self):
scaled_loss = scaler.scale(loss)
scaled_loss.backward()
optimize_ops, params_grads = scaler.minimize(optimizer, scaled_loss)
self.assertEqual(scaler._found_inf.numpy() == 1, True)
self.assertEqual(scaler._found_inf.numpy() >= 1, True)

for param in model.parameters():
# param not update when tensor contains nan or inf
Expand Down

0 comments on commit 789e764

Please sign in to comment.