diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index d07909251dcfb..fe3ba31916c3f 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -190,6 +190,12 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { if (checkFPAtomicToDenormModeHazard(MI) > 0) return HazardType; + // Hazards which cannot be mitigated with S_NOPs. + if (!IsHazardRecognizerMode) { + if (checkWMMACoexecutionHazards(MI) > 0) + return Hazard; + } + if (ST.hasNoDataDepHazard()) return NoHazard; diff --git a/llvm/test/CodeGen/AMDGPU/misched-into-wmma-hazard-shadow.mir b/llvm/test/CodeGen/AMDGPU/misched-into-wmma-hazard-shadow.mir new file mode 100644 index 0000000000000..e3c8acc837f09 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/misched-into-wmma-hazard-shadow.mir @@ -0,0 +1,56 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -march=amdgcn -mcpu=gfx1250 -run-pass=postmisched,post-RA-hazard-rec %s -o - | FileCheck --check-prefix=GCN %s + +# Bring all independent V_LSHL_ADD_U32_e64 instructions into the shadow +# of the WMMA so that the hazard recognizer only needs to insert 4 V_NOP_e32 +# instructions instead of 8. 
+ +--- +name: test_wmma_scale_f32_16x16x128_f8f6f4_shadow_sched +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 + + ; GCN-LABEL: name: test_wmma_scale_f32_16x16x128_f8f6f4_shadow_sched + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr46 = V_LSHL_ADD_U32_e64 killed $vgpr43, 1, $vgpr43, implicit $exec + ; GCN-NEXT: $vgpr47 = V_LSHL_ADD_U32_e64 killed $vgpr42, 1, $vgpr42, implicit $exec + ; GCN-NEXT: $vgpr48 = V_LSHL_ADD_U32_e64 killed $vgpr41, 1, $vgpr41, implicit $exec + ; GCN-NEXT: $vgpr49 = V_LSHL_ADD_U32_e64 killed $vgpr40, 1, $vgpr40, implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: 
V_NOP_e32 implicit $exec + ; GCN-NEXT: $vgpr4 = V_ADD_F32_e32 1065353216, killed $vgpr4, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr5 = V_ADD_F32_e32 1065353216, killed $vgpr5, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr6 = V_ADD_F32_e32 1065353216, killed $vgpr6, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr7 = V_ADD_F32_e32 1065353216, killed $vgpr7, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr8 = V_ADD_F32_e32 1065353216, killed $vgpr8, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr9 = V_ADD_F32_e32 1065353216, killed $vgpr9, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr10 = V_ADD_F32_e32 1065353216, killed $vgpr10, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr11 = V_ADD_F32_e32 1065353216, killed $vgpr11, implicit $mode, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORDX4 renamable $vgpr44_vgpr45, killed renamable $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORDX4 renamable $vgpr44_vgpr45, killed renamable $vgpr8_vgpr9_vgpr10_vgpr11, 32, 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORDX4 killed renamable $vgpr44_vgpr45, killed renamable $vgpr46_vgpr47_vgpr48_vgpr49, 64, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr4 = V_ADD_F32_e32 1065353216, $vgpr4, implicit $mode, implicit $exec + $vgpr5 = V_ADD_F32_e32 1065353216, $vgpr5, implicit $mode, implicit $exec + $vgpr6 = V_ADD_F32_e32 1065353216, $vgpr6, implicit $mode, implicit $exec + $vgpr7 = V_ADD_F32_e32 1065353216, $vgpr7, implicit $mode, implicit $exec + $vgpr8 = V_ADD_F32_e32 
1065353216, $vgpr8, implicit $mode, implicit $exec + $vgpr9 = V_ADD_F32_e32 1065353216, $vgpr9, implicit $mode, implicit $exec + $vgpr10 = V_ADD_F32_e32 1065353216, $vgpr10, implicit $mode, implicit $exec + $vgpr11 = V_ADD_F32_e32 1065353216, $vgpr11, implicit $mode, implicit $exec + $vgpr46 = V_LSHL_ADD_U32_e64 $vgpr43, 1, $vgpr43, implicit $exec + $vgpr47 = V_LSHL_ADD_U32_e64 $vgpr42, 1, $vgpr42, implicit $exec + $vgpr48 = V_LSHL_ADD_U32_e64 $vgpr41, 1, $vgpr41, implicit $exec + $vgpr49 = V_LSHL_ADD_U32_e64 $vgpr40, 1, $vgpr40, implicit $exec + GLOBAL_STORE_DWORDX4 renamable $vgpr44_vgpr45, killed renamable $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 renamable $vgpr44_vgpr45, killed renamable $vgpr8_vgpr9_vgpr10_vgpr11, 32, 0, implicit $exec + GLOBAL_STORE_DWORDX4 renamable $vgpr44_vgpr45, killed renamable $vgpr46_vgpr47_vgpr48_vgpr49, 64, 0, implicit $exec + S_ENDPGM 0 +...