microsoft · lekurile · Sep 14, 2023 · Sep 14, 2023
@@ -247,6 +247,11 @@ def compute_attention(self, qkv_out, input_mask, layer_past, alibi):
 
         offset = dist.get_rank() * self.num_attention_heads_per_partition if dist.is_initialized() else 0
         target_dtype = torch.float16 if self.config.dtype == torch.int8 else self.config.dtype
+
+        # With the hybrid engine and BLOOM, input_mask can arrive as torch.bool; cast it to torch.int64 because `1 - input_mask` below is not supported for bool tensors
+        if input_mask.dtype == torch.bool:
+            input_mask = input_mask.long()
+
         attention_probs = self.softmax_func(attn_scores=attention_scores,
                                             attn_mask=((1 - input_mask).to(target_dtype) * minus_inf),
                                             alibi=alibi,