-
Notifications
You must be signed in to change notification settings - Fork 15.2k
AMDGPU: Constrain readfirstlane operand to vgpr_32 #168001
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Constrain readfirstlane operand to vgpr_32 #168001
Conversation
When inserting a readfirstlane, ensure the operand constraint is respected. If the source register was an av_* class, the verifier would fail. Fixes regression after c7019c7
This stack of pull requests is managed by Graphite. Learn more about stacking. |
|
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesWhen inserting a readfirstlane, ensure the operand constraint Fixes regression after c7019c7 Full diff: https://github.com/llvm/llvm-project/pull/168001.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 7793907c032d2..27b9af2d3885f 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -1122,9 +1122,20 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
.addReg(VReg32);
} else if (SrcSize == 32) {
- auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
- TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
- MIB.addReg(SrcReg, 0, SubReg);
+ const MCInstrDesc &ReadFirstLaneDesc =
+ TII->get(AMDGPU::V_READFIRSTLANE_B32);
+ const TargetRegisterClass *OpRC = TII->getRegClass(ReadFirstLaneDesc, 1);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), ReadFirstLaneDesc, DstReg)
+ .addReg(SrcReg, 0, SubReg);
+
+ const TargetRegisterClass *ConstrainRC =
+ SubReg == AMDGPU::NoSubRegister
+ ? OpRC
+ : TRI->getMatchingSuperRegClass(MRI->getRegClass(SrcReg), OpRC,
+ SubReg);
+
+ if (!MRI->constrainRegClass(SrcReg, ConstrainRC))
+ llvm_unreachable("failed to constrain register");
} else {
auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::REG_SEQUENCE), DstReg);
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-readfirstlane-av-register-regression.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-readfirstlane-av-register-regression.ll
new file mode 100644
index 0000000000000..b05b89fe503f2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-readfirstlane-av-register-regression.ll
@@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck %s
+
+; SIFixSGPRCopies will insert a readfirstlane from an AV source
+; register, which needs to be constrained by VGPR to satisfy the
+; operand constraint.
+
+define amdgpu_kernel void @constrain_readfirstlane_av(i64 %arg, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: constrain_readfirstlane_av:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b64 s[6:7], 0
+; CHECK-NEXT: s_and_b64 vcc, exec, -1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_readfirstlane_b32 s4, v1
+; CHECK-NEXT: s_and_b32 s4, s4, 0xffff
+; CHECK-NEXT: .LBB0_1: ; %bb16
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_mul_i32 s8, s6, s1
+; CHECK-NEXT: s_mul_hi_u32 s9, s6, s0
+; CHECK-NEXT: s_mul_i32 s7, s7, s0
+; CHECK-NEXT: s_add_i32 s8, s9, s8
+; CHECK-NEXT: s_mul_i32 s6, s6, s0
+; CHECK-NEXT: s_add_i32 s7, s8, s7
+; CHECK-NEXT: s_lshl_b64 s[6:7], s[6:7], 5
+; CHECK-NEXT: s_add_u32 s6, s2, s6
+; CHECK-NEXT: s_addc_u32 s7, s3, s7
+; CHECK-NEXT: global_load_dword v1, v0, s[6:7] glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
+; CHECK-NEXT: s_mov_b64 vcc, vcc
+; CHECK-NEXT: s_cbranch_vccnz .LBB0_1
+; CHECK-NEXT: ; %bb.2: ; %DummyReturnBlock
+; CHECK-NEXT: s_endpgm
+bb:
+ %i = load volatile i16, ptr addrspace(1) %ptr, align 2
+ %i6 = zext i16 %i to i64
+ br label %bb16
+
+bb16: ; preds = %bb16, %bb
+ %i17 = phi i64 [ %i6, %bb16 ], [ 0, %bb ]
+ %i23 = mul i64 %i17, %arg
+ %i25.split = getelementptr [16 x half], ptr addrspace(1) %ptr, i64 %i23
+ %i27 = load volatile <2 x half>, ptr addrspace(1) %i25.split, align 16
+ br label %bb16
+}
+
+
diff --git a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-av-constrain.mir b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-av-constrain.mir
new file mode 100644
index 0000000000000..ac4f41282ab73
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-av-constrain.mir
@@ -0,0 +1,92 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: constrain_readfirstlane_av
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: constrain_readfirstlane_av
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_READFIRSTLANE_B32_]], [[DEF]], implicit-def dead $scc
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 [[S_AND_B32_]], [[S_AND_B32_]]
+ ; CHECK-NEXT: [[S_MUL_HI_U32_:%[0-9]+]]:sreg_32 = S_MUL_HI_U32 [[S_AND_B32_]], [[S_MUL_I32_]]
+ ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_MUL_HI_U32_]], [[S_MUL_I32_]], implicit-def dead $scc
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ liveins: $vgpr0
+
+ %0:sreg_32 = IMPLICIT_DEF
+ %1:av_32 = COPY $vgpr0
+ %2:sreg_32 = COPY %1
+ %3:sreg_32 = S_AND_B32 %2, %0, implicit-def dead $scc
+
+ bb.1:
+ %4:sreg_32 = S_MUL_I32 %3, %3
+ %5:sreg_32 = S_MUL_HI_U32 %3, %4
+ %6:sreg_32 = S_ADD_I32 %5, %4, implicit-def dead $scc
+ S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+
+# Need to respect subregister on copy source
+---
+name: constrain_readfirstlane_av64
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: constrain_readfirstlane_av64
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]].sub0, implicit $exec
+ ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_READFIRSTLANE_B32_]], [[DEF]], implicit-def dead $scc
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 [[S_AND_B32_]], [[S_AND_B32_]]
+ ; CHECK-NEXT: [[S_MUL_HI_U32_:%[0-9]+]]:sreg_32 = S_MUL_HI_U32 [[S_AND_B32_]], [[S_MUL_I32_]]
+ ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_MUL_HI_U32_]], [[S_MUL_I32_]], implicit-def dead $scc
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ liveins: $vgpr0_vgpr1
+
+ %0:sreg_32 = IMPLICIT_DEF
+ %1:av_64 = COPY $vgpr0_vgpr1
+ %2:sreg_32 = COPY %1.sub0
+ %3:sreg_32 = S_AND_B32 %2, %0, implicit-def dead $scc
+
+ bb.1:
+ %4:sreg_32 = S_MUL_I32 %3, %3
+ %5:sreg_32 = S_MUL_HI_U32 %3, %4
+ %6:sreg_32 = S_ADD_I32 %5, %4, implicit-def dead $scc
+ S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+
|
jayfoad
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
But isn't there a helper somewhere that would do the subreg handling for you? It feels like there should be a "constrain this operand" helper to do it all.
There is a globalisel flavored one which won't work here. I couldn't find another |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/59/builds/27315 Here is the relevant piece of the build log for the reference |

When inserting a readfirstlane, ensure the operand constraint
is respected. If the source register was an av_* class, the
verifier would fail.
Fixes regression after c7019c7