diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 84984a0871dac..fdc55a4ef62e6 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -730,14 +730,11 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
     }
   }
 
-  // Rework once the VS_16 register class is updated to include proper
-  // 16-bit SGPRs instead of 32-bit ones.
-  if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
-    Old.setSubReg(AMDGPU::NoSubRegister);
+  Old.setSubReg(New->getSubReg());
   if (New->getReg().isPhysical()) {
     Old.substPhysReg(New->getReg(), *TRI);
   } else {
-    Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
+    Old.substVirtReg(New->getReg(), 0, *TRI);
     Old.setIsUndef(New->isUndef());
   }
   return true;
@@ -1150,10 +1147,14 @@ void SIFoldOperandsImpl::foldOperand(
   if (UseOp->isReg() && OpToFold.isReg()) {
     if (UseOp->isImplicit())
       return;
-    // Allow folding from SGPRs to 16-bit VGPRs.
+
+    MachineInstr *SourceInstruction = MRI->getVRegDef(UseOp->getReg());
+    // Allow folding from SGPRs to 16-bit VGPRs
+    // or folding of non-subregs through REG_SEQUENCEs.
     if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
         (UseOp->getSubReg() != AMDGPU::lo16 ||
-         !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
+         !TRI->isSGPRReg(*MRI, OpToFold.getReg())) &&
+        !SourceInstruction->isRegSequence())
       return;
   }
 
@@ -1452,6 +1453,35 @@ void SIFoldOperandsImpl::foldOperand(
     return;
   }
 
+  // FIXME: If we properly encode the 32-bit aligned register requirement for
+  // these DS_GWS instructions, this can be removed.
+  if (!FoldingImmLike && OpToFold.isReg() && ST->needsAlignedVGPRs()) {
+    unsigned Opc = UseMI->getOpcode();
+    // Special case for DS_GWS instructions that only use 32 bits but the
+    // hardware treats them as a 64-bit read.
+    if (Opc == AMDGPU::DS_GWS_INIT || Opc == AMDGPU::DS_GWS_SEMA_BR ||
+        Opc == AMDGPU::DS_GWS_BARRIER) {
+      const TargetRegisterClass *RC =
+          TRI->getRegClassForReg(*MRI, OpToFold.getReg());
+      assert(RC);
+
+      const auto isAlignedReg = [&OpToFold, &UseOp, &UseMI, &RC,
+                                 this](AMDGPU::OpName OpName) -> bool {
+        const MachineOperand *Op = TII->getNamedOperand(*UseMI, OpName);
+        if (Op != UseOp)
+          return true;
+        Register Reg = OpToFold.getReg();
+        assert(!Reg.isPhysical());
+        return TRI->getRegSizeInBits(*RC) > 32 &&
+               !(TRI->getChannelFromSubReg(OpToFold.getSubReg()) & 1) &&
+               TRI->isProperlyAlignedRC(*RC);
+      };
+
+      if (!isAlignedReg(AMDGPU::OpName::data0))
+        return;
+    }
+  }
+
   // FIXME: We could try to change the instruction from 64-bit to 32-bit
   // to enable more folding opportunities. The shrink operands pass
   // already does this.
diff --git a/llvm/test/CodeGen/AMDGPU/fold-non-subregs-through-regsequences.mir b/llvm/test/CodeGen/AMDGPU/fold-non-subregs-through-regsequences.mir
new file mode 100644
index 0000000000000..94038f17950cd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fold-non-subregs-through-regsequences.mir
@@ -0,0 +1,390 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass=si-fold-operands -o - %s | FileCheck %s
+---
+name: v_readfirstlane_b32_omitted
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: v_readfirstlane_b32_omitted
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+    ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY [[DEF]]
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 [[REG_SEQUENCE1]]
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:vgpr_32 = COPY $vgpr0
+    %1:sreg_32 = IMPLICIT_DEF
+    %2:sgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = COPY %1
+    %4:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %3, %subreg.sub1
+    %5:sgpr_32 = V_READFIRSTLANE_B32 %4.sub0, implicit $exec
+    %6:sgpr_32 = V_READFIRSTLANE_B32 %4.sub1, implicit $exec
+    %7:sreg_64 = REG_SEQUENCE %5, %subreg.sub0, %6, %subreg.sub1
+    %8:sreg_64 = S_QUADMASK_B64 %7, implicit-def $scc
+    S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_omitted_switched_subregs
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: v_readfirstlane_b32_omitted_switched_subregs
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub1, killed [[COPY1]], %subreg.sub0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY [[DEF]]
+    ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_READFIRSTLANE_B32_]], %subreg.sub1
+    ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 killed [[REG_SEQUENCE1]]
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:vgpr_32 = COPY $vgpr0
+    %1:sreg_32 = IMPLICIT_DEF
+    %2:sgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = COPY %1
+    %4:vreg_64 = REG_SEQUENCE %0, %subreg.sub1, killed %3, %subreg.sub0
+    %5:sgpr_32 = V_READFIRSTLANE_B32 %4.sub0, implicit $exec
+    %6:sgpr_32 = V_READFIRSTLANE_B32 %4.sub1, implicit $exec
+    %7:sreg_64 = REG_SEQUENCE %5, %subreg.sub0, %6, %subreg.sub1
+    %8:sreg_64 = S_QUADMASK_B64 killed %7, implicit-def $scc
+    S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_phys_vgpr_and_sgpr
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $sgpr0
+
+    ; CHECK-LABEL: name: v_readfirstlane_b32_phys_vgpr_and_sgpr
+    ; CHECK: liveins: $vgpr0, $sgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]], implicit $exec
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
+    ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY [[COPY1]]
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[COPY3]], %subreg.sub1
+    ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 [[REG_SEQUENCE1]]
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:vgpr_32 = COPY $vgpr0
+    %1:sreg_32 = COPY $sgpr0
+    %2:sgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = COPY %1, implicit $exec
+    %4:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %3, %subreg.sub1
+    %5:sgpr_32 = V_READFIRSTLANE_B32 %4.sub0, implicit $exec
+    %6:sgpr_32 = V_READFIRSTLANE_B32 %4.sub1, implicit $exec
+    %7:sreg_64 = REG_SEQUENCE %5, %subreg.sub0, %6, %subreg.sub1
+    %8:sreg_64 = S_QUADMASK_B64 %7, implicit-def $scc
+    S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_both_vgpr
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: v_readfirstlane_b32_both_vgpr
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+    ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec
+    ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+    ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 [[REG_SEQUENCE1]]
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:sreg_32 = IMPLICIT_DEF
+    %3:sgpr_32 = IMPLICIT_DEF
+    %4:sgpr_32 = IMPLICIT_DEF
+    %5:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
+    %6:sgpr_32 = V_READFIRSTLANE_B32 %5.sub0, implicit $exec
+    %7:sgpr_32 = V_READFIRSTLANE_B32 %5.sub1, implicit $exec
+    %8:sreg_64 = REG_SEQUENCE %6, %subreg.sub0, %7, %subreg.sub1
+    %9:sreg_64 = S_QUADMASK_B64 %8, implicit-def $scc
+    S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_both_sgpr
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+
+    ; CHECK-LABEL: name: v_readfirstlane_b32_both_sgpr
+    ; CHECK: liveins: $sgpr0, $sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+    ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 [[REG_SEQUENCE]]
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sreg_32 = IMPLICIT_DEF
+    %3:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1
+    %4:sreg_64 = S_QUADMASK_B64 %3, implicit-def $scc
+    S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_recursive_look_through_copies_and_reg_sequence_for_vgpr
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: v_readfirstlane_b32_recursive_look_through_copies_and_reg_sequence_for_vgpr
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub0, [[COPY1]], %subreg.sub2, [[COPY1]], %subreg.sub3
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[REG_SEQUENCE]].sub3, %subreg.sub1, [[REG_SEQUENCE]].sub1, %subreg.sub0
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY [[DEF]]
+    ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+    ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[V_READFIRSTLANE_B32_]], %subreg.sub1
+    ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 killed [[REG_SEQUENCE2]]
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:vgpr_32 = COPY $vgpr0
+    %1:sreg_32 = IMPLICIT_DEF
+    %2:vgpr_32 = COPY %0
+    %3:sgpr_32 = IMPLICIT_DEF
+    %4:sgpr_32 = IMPLICIT_DEF
+    %5:sgpr_32 = IMPLICIT_DEF
+    %6:vgpr_32 = COPY %1
+    %7:vreg_128 = REG_SEQUENCE killed %6, %subreg.sub1, %2, %subreg.sub0, %2, %subreg.sub2, %2, %subreg.sub3
+    %8:vreg_64 = REG_SEQUENCE killed %7.sub3, %subreg.sub1, %7.sub1, %subreg.sub0
+    %9:sgpr_32 = V_READFIRSTLANE_B32 %8.sub0, implicit $exec
+    %10:sgpr_32 = V_READFIRSTLANE_B32 %8.sub1, implicit $exec
+    %11:sreg_64 = REG_SEQUENCE %9, %subreg.sub0, %10, %subreg.sub1
+    %12:sreg_64 = S_QUADMASK_B64 killed %11, implicit-def $scc
+    S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_recursive_look_through_copies_and_reg_sequence_for_sgpr
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; CHECK-LABEL: name: v_readfirstlane_b32_recursive_look_through_copies_and_reg_sequence_for_sgpr
+    ; CHECK: liveins: $sgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY [[COPY]]
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[DEF]], %subreg.sub1, killed [[COPY1]], %subreg.sub0, killed [[COPY1]], %subreg.sub2, killed [[COPY1]], %subreg.sub3
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[REG_SEQUENCE]].sub3, %subreg.sub1, [[REG_SEQUENCE]].sub1, %subreg.sub0
+    ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 killed [[REG_SEQUENCE1]]
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sreg_32 = IMPLICIT_DEF
+    %2:sgpr_32 = COPY %0
+    %3:sgpr_128 = REG_SEQUENCE killed %1, %subreg.sub1, killed %2, %subreg.sub0, killed %2, %subreg.sub2, killed %2, %subreg.sub3
+    %4:sreg_64 = REG_SEQUENCE killed %3.sub3, %subreg.sub1, %3.sub1, %subreg.sub0
+    %5:sreg_64 = S_QUADMASK_B64 killed %4, implicit-def $scc
+    S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_undef_subreg
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: v_readfirstlane_b32_undef_subreg
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY3]], %subreg.sub1, [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub2
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[REG_SEQUENCE]].sub3, %subreg.sub1, [[REG_SEQUENCE]].sub1, %subreg.sub0
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY [[DEF]]
+    ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub1, implicit $exec
+    ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_READFIRSTLANE_B32_]], %subreg.sub1
+    ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 killed [[REG_SEQUENCE2]]
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:vgpr_32 = COPY $vgpr2
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = COPY $vgpr0
+    %3:sreg_32 = IMPLICIT_DEF
+    %4:sreg_32 = IMPLICIT_DEF
+    %5:sgpr_32 = IMPLICIT_DEF
+    %6:sgpr_32 = IMPLICIT_DEF
+    %7:vgpr_32 = COPY %3
+    %8:vreg_128 = REG_SEQUENCE killed %7, %subreg.sub1, %1, %subreg.sub0, %0, %subreg.sub2
+    %9:vreg_64 = REG_SEQUENCE killed %8.sub3, %subreg.sub1, %8.sub1, %subreg.sub0
+    %10:sgpr_32 = V_READFIRSTLANE_B32 %9.sub0, implicit $exec
+    %11:sgpr_32 = V_READFIRSTLANE_B32 %9.sub1, implicit $exec
+    %12:sreg_64 = REG_SEQUENCE %10, %subreg.sub0, %11, %subreg.sub1
+    %13:sreg_64 = S_QUADMASK_B64 killed %12, implicit-def $scc
+    S_ENDPGM 0
+
+...
+---
+name: fold_aligned_reg_into_required_aligned_reg
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: fold_aligned_reg_into_required_aligned_reg
+    ; CHECK: S_NOP 0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:av_128_with_sub1_sub2_in_vreg_64_align2 = COPY undef renamable $sgpr0_sgpr1_sgpr2_sgpr3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[COPY]].sub1_sub2
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[DEF]], %subreg.sub2, [[DEF]], %subreg.sub3
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X4F16_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_4X4X4F16_vgprcd_e64 [[COPY]].sub1_sub2, [[COPY]].sub1_sub2, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_]], [[V_MFMA_F32_4X4X4F16_vgprcd_e64_]], undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0
+    S_NOP 0
+    %0:vreg_128 = COPY undef renamable $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:vreg_64_align2 = COPY %0.sub1_sub2
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:vreg_128_align2 = REG_SEQUENCE %1, %subreg.sub0_sub1, %2, %subreg.sub2, %2, %subreg.sub3
+    %4:vreg_128_align2 = V_MFMA_F32_4X4X4F16_vgprcd_e64 %3.sub0_sub1, %3.sub0_sub1, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    GLOBAL_STORE_DWORDX4_SADDR %5, %4, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+    S_ENDPGM 0
+...
+---
+name: not_fold_into_ds_gws_align
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: $m0 = S_MOV_B32 0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[DEF]], %subreg.sub1
+    ; CHECK-NEXT: BUNDLE implicit [[REG_SEQUENCE]], implicit $m0, implicit $exec {
+    ; CHECK-NEXT:   DS_GWS_INIT [[REG_SEQUENCE]].sub0, 0, implicit $m0, implicit $exec, implicit [[REG_SEQUENCE]] :: (store (s32) into custom "GWSResource")
+    ; CHECK-NEXT:   S_WAITCNT 0
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:vreg_64_align2 = COPY $vgpr0_vgpr1
+    %1:vgpr_32 = COPY %0.sub1
+    $m0 = S_MOV_B32 0
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:vreg_64_align2 = REG_SEQUENCE %1, %subreg.sub0, %2, %subreg.sub1
+    BUNDLE implicit %3, implicit $m0, implicit $exec {
+      DS_GWS_INIT %3.sub0, 0, implicit $m0, implicit $exec, implicit %3 :: (store (s32) into custom "GWSResource")
+      S_WAITCNT 0
+    }
+    S_ENDPGM 0
+...
+---
+name: fold_into_ds_gws_align
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: $m0 = S_MOV_B32 0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[DEF]], %subreg.sub1
+    ; CHECK-NEXT: BUNDLE implicit [[REG_SEQUENCE]], implicit $m0, implicit $exec {
+    ; CHECK-NEXT:   DS_GWS_INIT [[COPY]].sub0, 0, implicit $m0, implicit $exec, implicit [[REG_SEQUENCE]] :: (store (s32) into custom "GWSResource")
+    ; CHECK-NEXT:   S_WAITCNT 0
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:vreg_64_align2 = COPY $vgpr0_vgpr1
+    %1:vgpr_32 = COPY %0.sub0
+    $m0 = S_MOV_B32 0
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:vreg_64_align2 = REG_SEQUENCE %1, %subreg.sub0, %2, %subreg.sub1
+    BUNDLE implicit %3, implicit $m0, implicit $exec {
+      DS_GWS_INIT %3.sub0, 0, implicit $m0, implicit $exec, implicit %3 :: (store (s32) into custom "GWSResource")
+      S_WAITCNT 0
+    }
+    S_ENDPGM 0
+...
+---
+name: tied_def_folding
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: tied_def_folding
+    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+    ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 0, [[DEF]], implicit $exec, implicit [[COPY]](tied-def 0)
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_U32_e32_]]
+    %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = COPY %1
+    %3:vgpr_32 = V_ADD_U32_e32 undef %0, %2:vgpr_32, implicit $exec, implicit %2:vgpr_32(tied-def 0)
+    GLOBAL_STORE_DWORD_SADDR %0, %0, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    S_ENDPGM 0, implicit %3
+...
+---
+name: tied_def_subreg_folding
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: tied_def_subreg_folding
+    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 10, implicit $exec
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B]].sub1, %subreg.sub1
+    ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 0, [[REG_SEQUENCE]].sub1, implicit $exec, implicit [[REG_SEQUENCE]].sub1(tied-def 0)
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_U32_e32_]]
+    %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1:vreg_64_align2 = V_MOV_B64_PSEUDO 10, implicit $exec
+    %2:vreg_64_align2 = COPY %1
+    %3:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %2.sub1, %subreg.sub1
+    %4:vgpr_32 = V_ADD_U32_e32 undef %0, %3.sub1, implicit $exec, implicit %3.sub1(tied-def 0)
+    GLOBAL_STORE_DWORD_SADDR %0, %0, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    S_ENDPGM 0, implicit %4
+...
diff --git a/llvm/test/CodeGen/AMDGPU/fold-readlane.mir b/llvm/test/CodeGen/AMDGPU/fold-readlane.mir
index 3ac463b4fb448..e2b1857a81729 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-readlane.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-readlane.mir
@@ -371,8 +371,6 @@ body: |
 # GCN-NEXT: %2:vgpr_32 = COPY %0
 # GCN-NEXT: %3:vgpr_32 = COPY %1
 # GCN-NEXT: %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, killed %3, %subreg.sub1
-# GCN-NEXT: %5:sreg_32_xm0 = V_READFIRSTLANE_B32 %4.sub0, implicit $exec
-# GCN-NEXT: %6:sreg_32_xm0 = V_READFIRSTLANE_B32 %4.sub1, implicit $exec
 ---
 name: fold-copy-readfirstlane-regsequence1
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
index b9bf76c1423b6..7762b09abd72e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
@@ -260,3 +260,20 @@ define amdgpu_kernel void @test_scc_quadmask_64(i32 %val0, i64 %val1, ptr addrsp
   store i32 %sel, ptr addrspace(1) null, align 4
   ret void
 }
+
+; See issue #125950
+define amdgpu_ps void @issue125950_test_quadmask_half_poison_i64(i32 %in, ptr %out) {
+; GFX11-LABEL: issue125950_test_quadmask_half_poison_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_quadmask_b64 s[0:1], s[0:1]
+; GFX11-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT:    flat_store_b64 v[1:2], v[3:4]
+; GFX11-NEXT:    s_endpgm
+  %v1 = insertelement <2 x i32> poison, i32 %in, i32 0
+  %v2 = bitcast <2 x i32> %v1 to i64
+  %v3 = call i64 @llvm.amdgcn.s.quadmask.i64(i64 %v2)
+  %p = inttoptr i64 %v2 to ptr addrspace(4)
+  store i64 %v3, ptr %out
+  ret void
+}