[AMDGPU] W/a for gfx940 byte0 fp8 conversion bug

The VOP1 forms of these conversions do not work. Differential Revision: https://reviews.llvm.org/D157683
llvm · Aug 11, 2023 · 02046ad · 02046ad
1 parent 3e596ed
commit 02046ad
Show file tree

Hide file tree

Showing 4 changed files with 27 additions and 11 deletions.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1862,6 +1862,9 @@ def HasGDS : Predicate<"Subtarget->hasGDS()">;
 
 def HasGWS : Predicate<"Subtarget->hasGWS()">;
 
+def HasCvtFP8VOP1Bug : Predicate<"Subtarget->hasCvtFP8VOP1Bug()">;
+def HasNoCvtFP8VOP1Bug : Predicate<"!Subtarget->hasCvtFP8VOP1Bug()">;
+
 // Include AMDGPU TD files
 include "SISchedule.td"
 include "GCNProcessors.td"

diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1172,6 +1172,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // \returns true if the target supports the pre-NGG legacy geometry path.
   bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
 
+  /// \returns true if the VOP1 form of FP8/BF8 to F32 conversion is unreliable.
+  bool hasCvtFP8VOP1Bug() const { return true; }
+
   /// \returns SGPR allocation granularity supported by the subtarget.
   unsigned getSGPRAllocGranule() const {
     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);

diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -584,18 +584,28 @@ let SubtargetPredicate = HasFP8Insts, mayRaiseFPException = 0,
 }
 
 class Cvt_F32_F8_Pat<SDPatternOperator node, int index,
-    VOP1_Pseudo inst_e32, VOP1_SDWA_Pseudo inst_sdwa> : GCNPat<
+    VOP1_SDWA_Pseudo inst_sdwa> : GCNPat<
     (f32 (node i32:$src, index)),
-    !if (index,
-         (inst_sdwa 0, $src, 0, 0, index),
-         (inst_e32 $src))
+    (inst_sdwa 0, $src, 0, 0, index)
 >;
 
-foreach Index = [0, 1, 2, 3] in {
-  def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_fp8, Index,
-                       V_CVT_F32_FP8_e32, V_CVT_F32_FP8_sdwa>;
-  def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_bf8, Index,
-                       V_CVT_F32_BF8_e32, V_CVT_F32_BF8_sdwa>;
+let OtherPredicates = [HasCvtFP8VOP1Bug] in {
+  def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)),
+               (V_CVT_F32_FP8_sdwa 0, $src, 0, 0, 0)>;
+  def : GCNPat<(f32 (int_amdgcn_cvt_f32_bf8 i32:$src, 0)),
+               (V_CVT_F32_BF8_sdwa 0, $src, 0, 0, 0)>;
+}
+
+let OtherPredicates = [HasNoCvtFP8VOP1Bug] in {
+  def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)),
+               (V_CVT_F32_FP8_e32 $src)>;
+  def : GCNPat<(f32 (int_amdgcn_cvt_f32_bf8 i32:$src, 0)),
+               (V_CVT_F32_BF8_e32 $src)>;
+}
+
+foreach Index = [1, 2, 3] in {
+  def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_fp8, Index, V_CVT_F32_FP8_sdwa>;
+  def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_bf8, Index, V_CVT_F32_BF8_sdwa>;
 }
 
 class Cvt_PK_F32_F8_Pat<SDPatternOperator node, int index,

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.cvt.sr.bf8.f32(float, i32, i32, i32)
 declare i32 @llvm.amdgcn.cvt.sr.fp8.f32(float, i32, i32, i32)
 
 ; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte0:
-; GCN: v_cvt_f32_bf8_e32 v0, v0{{$}}
+; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_0{{$}}
 define float @test_cvt_f32_bf8_byte0(i32 %a) {
   %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
   ret float %ret
@@ -38,7 +38,7 @@ define float @test_cvt_f32_bf8_byte3(i32 %a) {
 }
 
 ; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte0:
-; GCN: v_cvt_f32_fp8_e32 v0, v0{{$}}
+; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_0{{$}}
 define float @test_cvt_f32_fp8_byte0(i32 %a) {
   %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 0)
   ret float %ret