diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 7793907c032d2..27b9af2d3885f 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -1122,9 +1122,20 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg) .addReg(VReg32); } else if (SrcSize == 32) { - auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), - TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg); - MIB.addReg(SrcReg, 0, SubReg); + const MCInstrDesc &ReadFirstLaneDesc = + TII->get(AMDGPU::V_READFIRSTLANE_B32); + const TargetRegisterClass *OpRC = TII->getRegClass(ReadFirstLaneDesc, 1); + BuildMI(*MBB, MI, MI->getDebugLoc(), ReadFirstLaneDesc, DstReg) + .addReg(SrcReg, 0, SubReg); + + const TargetRegisterClass *ConstrainRC = + SubReg == AMDGPU::NoSubRegister + ? OpRC + : TRI->getMatchingSuperRegClass(MRI->getRegClass(SrcReg), OpRC, + SubReg); + + if (!MRI->constrainRegClass(SrcReg, ConstrainRC)) + llvm_unreachable("failed to constrain register"); } else { auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), DstReg); diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-readfirstlane-av-register-regression.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-readfirstlane-av-register-regression.ll new file mode 100644 index 0000000000000..b05b89fe503f2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-readfirstlane-av-register-regression.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck %s + +; SIFixSGPRCopies will insert a readfirstlane from an AV source +; register, which needs to be constrained to VGPR to satisfy the +; operand constraint. 
+ +define amdgpu_kernel void @constrain_readfirstlane_av(i64 %arg, ptr addrspace(1) %ptr) { +; CHECK-LABEL: constrain_readfirstlane_av: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b64 s[6:7], 0 +; CHECK-NEXT: s_and_b64 vcc, exec, -1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_ushort v1, v0, s[2:3] glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s4, v1 +; CHECK-NEXT: s_and_b32 s4, s4, 0xffff +; CHECK-NEXT: .LBB0_1: ; %bb16 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_mul_i32 s8, s6, s1 +; CHECK-NEXT: s_mul_hi_u32 s9, s6, s0 +; CHECK-NEXT: s_mul_i32 s7, s7, s0 +; CHECK-NEXT: s_add_i32 s8, s9, s8 +; CHECK-NEXT: s_mul_i32 s6, s6, s0 +; CHECK-NEXT: s_add_i32 s7, s8, s7 +; CHECK-NEXT: s_lshl_b64 s[6:7], s[6:7], 5 +; CHECK-NEXT: s_add_u32 s6, s2, s6 +; CHECK-NEXT: s_addc_u32 s7, s3, s7 +; CHECK-NEXT: global_load_dword v1, v0, s[6:7] glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b64 vcc, vcc +; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 +; CHECK-NEXT: ; %bb.2: ; %DummyReturnBlock +; CHECK-NEXT: s_endpgm +bb: + %i = load volatile i16, ptr addrspace(1) %ptr, align 2 + %i6 = zext i16 %i to i64 + br label %bb16 + +bb16: ; preds = %bb16, %bb + %i17 = phi i64 [ %i6, %bb16 ], [ 0, %bb ] + %i23 = mul i64 %i17, %arg + %i25.split = getelementptr [16 x half], ptr addrspace(1) %ptr, i64 %i23 + %i27 = load volatile <2 x half>, ptr addrspace(1) %i25.split, align 16 + br label %bb16 +} + + diff --git a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-av-constrain.mir b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-av-constrain.mir new file mode 100644 index 0000000000000..ac4f41282ab73 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-av-constrain.mir @@ -0,0 +1,92 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py 
UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck %s + +--- +name: constrain_readfirstlane_av +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: constrain_readfirstlane_av + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_READFIRSTLANE_B32_]], [[DEF]], implicit-def dead $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 [[S_AND_B32_]], [[S_AND_B32_]] + ; CHECK-NEXT: [[S_MUL_HI_U32_:%[0-9]+]]:sreg_32 = S_MUL_HI_U32 [[S_AND_B32_]], [[S_MUL_I32_]] + ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_MUL_HI_U32_]], [[S_MUL_I32_]], implicit-def dead $scc + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0 + + %0:sreg_32 = IMPLICIT_DEF + %1:av_32 = COPY $vgpr0 + %2:sreg_32 = COPY %1 + %3:sreg_32 = S_AND_B32 %2, %0, implicit-def dead $scc + + bb.1: + %4:sreg_32 = S_MUL_I32 %3, %3 + %5:sreg_32 = S_MUL_HI_U32 %3, %4 + %6:sreg_32 = S_ADD_I32 %5, %4, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 +... 
+ +# Need to respect subregister on copy source +--- +name: constrain_readfirstlane_av64 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: constrain_readfirstlane_av64 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]].sub0, implicit $exec + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_READFIRSTLANE_B32_]], [[DEF]], implicit-def dead $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 [[S_AND_B32_]], [[S_AND_B32_]] + ; CHECK-NEXT: [[S_MUL_HI_U32_:%[0-9]+]]:sreg_32 = S_MUL_HI_U32 [[S_AND_B32_]], [[S_MUL_I32_]] + ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_MUL_HI_U32_]], [[S_MUL_I32_]], implicit-def dead $scc + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0_vgpr1 + + %0:sreg_32 = IMPLICIT_DEF + %1:av_64 = COPY $vgpr0_vgpr1 + %2:sreg_32 = COPY %1.sub0 + %3:sreg_32 = S_AND_B32 %2, %0, implicit-def dead $scc + + bb.1: + %4:sreg_32 = S_MUL_I32 %3, %3 + %5:sreg_32 = S_MUL_HI_U32 %3, %4 + %6:sreg_32 = S_ADD_I32 %5, %4, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 +... +