
Commit b8dacfe
Merge branch 'master' into fix-hpz-with-zero-elt
loadams committed Jun 17, 2024
2 parents: 025ddb0 + 7331630
Showing 2 changed files with 5 additions and 2 deletions.
5 changes: 4 additions & 1 deletion accelerator/xpu_accelerator.py
@@ -159,7 +159,10 @@ def range_pop(self):
         return
 
     def lazy_call(self, callback):
-        return torch.xpu.lazy_init._lazy_call(callback)
+        if hasattr(torch.xpu, "_lazy_call"):
+            return torch.xpu._lazy_call(callback)
+        else:
+            return torch.xpu.lazy_init._lazy_call(callback)
 
     def communication_backend_name(self):
         return self._communication_backend_name
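The lazy_call change is a version-compatibility shim: depending on the PyTorch build, the private hook lives either at torch.xpu._lazy_call or at torch.xpu.lazy_init._lazy_call, so the accelerator now probes for the newer location and falls back to the older one. Below is a minimal sketch of that probe-and-fall-back pattern; the resolve_lazy_call helper and the SimpleNamespace stand-in for torch.xpu are hypothetical, included only so the example runs without XPU hardware or a particular PyTorch version.

    import types

    def resolve_lazy_call(xpu_module):
        """Return whichever _lazy_call entry point the given module exposes."""
        if hasattr(xpu_module, "_lazy_call"):
            return xpu_module._lazy_call
        return xpu_module.lazy_init._lazy_call

    # Stand-in for an older torch.xpu layout where the hook sits on lazy_init.
    fake_old_xpu = types.SimpleNamespace(
        lazy_init=types.SimpleNamespace(_lazy_call=lambda cb: cb()))

    resolve_lazy_call(fake_old_xpu)(lambda: print("callback ran via lazy_init fallback"))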
2 changes: 1 addition & 1 deletion deepspeed/runtime/pipe/engine.py
@@ -742,7 +742,7 @@ def _exec_forward_pass(self, buffer_id):
                 raise ValueError("expecting a tensor or a tuple of tensors")
             part = PartitionedTensor(tensor=first_output, group=self.grid.get_slice_parallel_group())
             # Clear the large output data, but save the computation graph
-            first_output.data = torch.zeros(1)
+            first_output.data = torch.zeros(1, device=first_output.data.device)
             self.pipe_buffers['output_tensors'][buffer_id] = first_output
             # Inject the partitioned tensor into the output before sending
             outputs = (part.to_meta(), part.data(), *outputs_tail)
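The engine.py change fixes a device mismatch: torch.zeros(1) allocates on the CPU, so rebinding first_output.data to it silently moved the saved output tensor off the accelerator the pipeline stage ran on, whereas device=first_output.data.device keeps the tiny placeholder on the original device. A short sketch of the effect, assuming a CUDA-style accelerator is present (it falls back to CPU so the snippet still runs anywhere):

    import torch

    # Pick whatever accelerator is around; fall back to CPU so the sketch still runs.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    first_output = torch.randn(4, 4, device=device, requires_grad=True)

    # torch.zeros(1) lives on the CPU, so the old assignment would silently move
    # the tensor off the accelerator:
    #   first_output.data = torch.zeros(1)
    # Allocating the placeholder on the same device preserves the tensor's device.
    first_output.data = torch.zeros(1, device=first_output.data.device)
    print(first_output.device)  # still the device the output was computed on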
