-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Support GFX12 VDSDIR instructions WAITVMSRC operand in GCNHazardRecognizer #77628
Conversation
…zardRecognizer

Modify GCNHazardRecognizer::fixLdsDirectVMEMHazard() so that the waitvsrc operand of gfx12 DS_PARAM_LOAD or DS_DIRECT_LOAD instructions is set appropriately depending on whether a hazard is found, rather than inserting an S_WAITCNT_DEPCTR instruction when a hazard needs to be mitigated.
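@llvm/pr-subscribers-backend-amdgpu

Author: Jay Foad (jayfoad)

Changes: Modify GCNHazardRecognizer::fixLdsDirectVMEMHazard() so that the waitvsrc operand of gfx12 DS_PARAM_LOAD or DS_DIRECT_LOAD instructions is set appropriately depending on whether a hazard is found, rather than inserting an S_WAITCNT_DEPCTR instruction when a hazard needs to be mitigated.

Patch is 24.03 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/77628.diff

4 Files Affected: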
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a7d8ff0242b801..bcd93e30d6c2d1 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1450,20 +1450,27 @@ bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
return false;
return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
};
- auto IsExpiredFn = [](const MachineInstr &I, int) {
+ bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
+ auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
(I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
(I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
- AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0);
+ AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
+ (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
+ !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
};
if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
std::numeric_limits<int>::max())
return false;
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
+ if (LdsdirCanWait) {
+ TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
+ } else {
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
+ }
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f6f37f5170a403..85d062a9a6f5e8 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1128,6 +1128,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasLdsDirect() const { return getGeneration() >= GFX11; }
+ bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
+
bool hasVALUPartialForwardingHazard() const {
return getGeneration() >= GFX11;
}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-direct-hazards.mir b/llvm/test/CodeGen/AMDGPU/lds-direct-hazards-gfx11.mir
similarity index 100%
rename from llvm/test/CodeGen/AMDGPU/lds-direct-hazards.mir
rename to llvm/test/CodeGen/AMDGPU/lds-direct-hazards-gfx11.mir
diff --git a/llvm/test/CodeGen/AMDGPU/lds-direct-hazards-gfx12.mir b/llvm/test/CodeGen/AMDGPU/lds-direct-hazards-gfx12.mir
new file mode 100644
index 00000000000000..1ef6ce88e61106
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-direct-hazards-gfx12.mir
@@ -0,0 +1,391 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name: lds_param_load_no_war
+body: |
+ bb.0:
+ ; GCN-LABEL: name: lds_param_load_no_war
+ ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr1 = DS_PARAM_LOAD 0, 0, 15, 1, implicit $m0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr0 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
+ $vgpr1 = DS_PARAM_LOAD 0, 0, 0, 1, implicit $m0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: lds_param_load_va_vdst0_war
+body: |
+ bb.0:
+ ; GCN-LABEL: name: lds_param_load_va_vdst0_war
+ ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr1 = DS_PARAM_LOAD 0, 0, 0, 1, implicit $m0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ $vgpr1 = DS_PARAM_LOAD 0, 0, 15, 1, implicit $m0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: lds_param_load_va_vdst0_war_salu
+body: |
+ bb.0:
+ ; GCN-LABEL: name: lds_param_load_va_vdst0_war_salu
+ ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: $m0 = S_MOV_B32 killed $sgpr0
+ ; GCN-NEXT: $vgpr1 = DS_PARAM_LOAD 0, 0, 0, 1, implicit $m0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ $m0 = S_MOV_B32 killed $sgpr0
+ $vgpr1 = DS_PARAM_LOAD 0, 0, 15, 1, implicit $m0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: lds_param_load_va_vdst1_war
+body: |
+ bb.0:
+ ; GCN-LABEL: name: lds_param_load_va_vdst1_war
+ ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr1 = DS_PARAM_LOAD 0, 0, 1, 1, implicit $m0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr1 = DS_PARAM_LOAD 0, 0, 15, 1, implicit $m0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: lds_param_load_va_vdst10_war
+body: |
+ bb.0:
+ ; GCN-LABEL: name: lds_param_load_va_vdst10_war
+ ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr1 = DS_PARAM_LOAD 0, 0, 10, 1, implicit $m0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr1 = DS_PARAM_LOAD 0, 0, 15, 1, implicit $m0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: lds_param_load_va_vdst10_waw
+body: |
+ bb.0:
+ ; GCN-LABEL: name: lds_param_load_va_vdst10_waw
+ ; GCN: $vgpr1 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr1 = DS_PARAM_LOAD 0, 0, 10, 1, implicit $m0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr1 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr1 = DS_PARAM_LOAD 0, 0, 15, 1, implicit $m0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: lds_param_load_va_vdst20_war
+body: |
+ bb.0:
+ ; GCN-LABEL: name: lds_param_load_va_vdst20_war
+ ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr12 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr13 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr14 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr15 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr16 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr17 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr18 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr19 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr20 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr21 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr1 = DS_PARAM_LOAD 0, 0, 15, 1, implicit $m0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr12 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr13 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr14 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr15 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr16 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr17 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr18 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr19 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr20 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr21 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr1 = DS_PARAM_LOAD 0, 0, 15, 1, implicit $m0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: lds_param_load_valu_war_trans
+body: |
+ bb.0:
+ ; GCN-LABEL: name: lds_param_load_valu_war_trans
+ ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr1 = DS_PARAM_LOAD 0, 0, 0, 1, implicit $m0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ $vgpr2 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec
+ $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr1 = DS_PARAM_LOAD 0, 0, 15, 1, implicit $m0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: lds_param_load_trans_war_valu
+body: |
+ bb.0:
+ ; GCN-LABEL: name: lds_param_load_trans_war_valu
+ ; GCN: $vgpr0 = V_SQRT_F32_e32 $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr1 = DS_PARAM_LOAD 0, 0, 0, 1, implicit $m0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr0 = V_SQRT_F32_e32 $vgpr1, implicit $mode, implicit $exec
+ $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr1 = DS_PARAM_LOAD 0, 0, 15, 1, implicit $m0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: lds_param_load_valu_war_vmem
+body: |
+ bb.0:
+ ; GCN-LABEL: name: lds_param_load_valu_war_vmem
+ ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
+ ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr1 = DS_PARAM_LOAD 0, 0, 15, 1, implicit $m0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
+ $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr1 = DS_PARAM_LOAD 0, 0, 15, 1, implicit $m0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: lds_param_load_valu_war_lds
+body: |
+ bb.0:
+ ; GCN-LABEL: name: lds_param_load_valu_war_lds
+ ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr10 = DS_READ_B32 $vgpr2, 0, 0, implicit $m0, implicit $exec
+ ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr1 = DS_PARAM_LOAD 0, 0, 15, 1, implicit $m0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr10 = DS_READ_B32 $vgpr2, 0, 0, implicit $m0, implicit $exec
+ $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr1 = DS_PARAM_LOAD 0, 0, 15, 1, implicit $m0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: lds_param_load_valu_war_ldsdir
+body: |
+ bb.0:
+ ; GCN-LABEL: name: lds_param_load_valu_war_ldsdir
+ ; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr10 = DS_PARAM_LOAD 0, 1, 15, 1, implicit $m0, implicit $exec
+ ; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr1 = DS_PARAM_LOAD 0, 0, 4, 1, implicit $m0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr10 = DS_PARAM_LOAD 0, 1, 15, 1, implicit $m0, implicit $exec
+ $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
+ $vgpr1 = DS_PARAM_LOAD 0, 0, 4, 1, implicit $m0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: lds_param_load_vmem_war
+body: |
+ bb.0:
+ ; GCN-LABEL: name: lds_param_load_vmem_war
+ ; GCN: $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
+ ; GCN-NEXT: $vgpr1 = DS_PARAM_LOAD 0, 0, 15, 0, implicit $m0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
+ $vgpr1 = DS_PARAM_LOAD 0, 0, 15, 1, implicit $m0, implicit $exec
+ S_EN...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
…zardRecognizer (llvm#77628)

Modify GCNHazardRecognizer::fixLdsDirectVMEMHazard() so that the waitvsrc operand of gfx12 DS_PARAM_LOAD or DS_DIRECT_LOAD instructions is set appropriately depending on whether a hazard is found, rather than inserting an S_WAITCNT_DEPCTR instruction when a hazard needs to be mitigated.

Co-authored-by: Stephen Thomas <Stephen.Thomas@amd.com>
Modify GCNHazardRecognizer::fixLdsDirectVMEMHazard() so the waitvsrc operand
in gfx12 DS_PARAM_LOAD or DS_DIRECT_LOAD instructions is set appropriately
depending on whether a hazard is found or not, rather than inserting an
S_WAITCNT_DEPCTR instruction if a hazard needs to be mitigated.