diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7d457edad0d5c..20bae0f27647a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4727,8 +4727,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     const SIRegisterInfo *TRI = ST.getRegisterInfo();
 
     Register Dst = MI.getOperand(0).getReg();
-    Register Src0 = MI.getOperand(1).getReg();
-    Register Src1 = MI.getOperand(2).getReg();
+    const MachineOperand &Src0 = MI.getOperand(1);
+    const MachineOperand &Src1 = MI.getOperand(2);
     const DebugLoc &DL = MI.getDebugLoc();
     Register SrcCond = MI.getOperand(3).getReg();
 
@@ -4737,20 +4737,42 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
     Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
 
+    const TargetRegisterClass *Src0RC = Src0.isReg()
+                                            ? MRI.getRegClass(Src0.getReg())
+                                            : &AMDGPU::VReg_64RegClass;
+    const TargetRegisterClass *Src1RC = Src1.isReg()
+                                            ? MRI.getRegClass(Src1.getReg())
+                                            : &AMDGPU::VReg_64RegClass;
+
+    const TargetRegisterClass *Src0SubRC =
+        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
+    const TargetRegisterClass *Src1SubRC =
+        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
+
+    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
+
+    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
+
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
       .addReg(SrcCond);
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
-      .addImm(0)
-      .addReg(Src0, 0, AMDGPU::sub0)
-      .addImm(0)
-      .addReg(Src1, 0, AMDGPU::sub0)
-      .addReg(SrcCondCopy);
+        .addImm(0)
+        .add(Src0Sub0)
+        .addImm(0)
+        .add(Src1Sub0)
+        .addReg(SrcCondCopy);
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
-      .addImm(0)
-      .addReg(Src0, 0, AMDGPU::sub1)
-      .addImm(0)
-      .addReg(Src1, 0, AMDGPU::sub1)
-      .addReg(SrcCondCopy);
+        .addImm(0)
+        .add(Src0Sub1)
+        .addImm(0)
+        .add(Src1Sub1)
+        .addReg(SrcCondCopy);
 
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
       .addReg(DstLo)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 327f8988ac2f1..f1d1fd386d1b2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7223,27 +7223,27 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
   MachineOperand &Src1 = Inst.getOperand(2);
   MachineOperand &Cond = Inst.getOperand(3);
 
-  Register SCCSource = Cond.getReg();
-  bool IsSCC = (SCCSource == AMDGPU::SCC);
+  Register CondReg = Cond.getReg();
+  bool IsSCC = (CondReg == AMDGPU::SCC);
 
   // If this is a trivial select where the condition is effectively not SCC
-  // (SCCSource is a source of copy to SCC), then the select is semantically
-  // equivalent to copying SCCSource. Hence, there is no need to create
+  // (CondReg is a source of copy to SCC), then the select is semantically
+  // equivalent to copying CondReg. Hence, there is no need to create
   // V_CNDMASK, we can just use that and bail out.
   if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
       (Src1.getImm() == 0)) {
-    MRI.replaceRegWith(Dest.getReg(), SCCSource);
+    MRI.replaceRegWith(Dest.getReg(), CondReg);
     return;
   }
 
-  const TargetRegisterClass *TC =
-      RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
-
-  Register CopySCC = MRI.createVirtualRegister(TC);
-
+  Register NewCondReg = CondReg;
   if (IsSCC) {
+    const TargetRegisterClass *TC =
+        RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+    NewCondReg = MRI.createVirtualRegister(TC);
+
     // Now look for the closest SCC def if it is a copy
-    // replacing the SCCSource with the COPY source register
+    // replacing the CondReg with the COPY source register
     bool CopyFound = false;
     for (MachineInstr &CandI :
          make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
@@ -7251,7 +7251,7 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
       if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
           -1) {
         if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
-          BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC)
+          BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
               .addReg(CandI.getOperand(1).getReg());
           CopyFound = true;
         }
@@ -7266,24 +7266,31 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
       unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
                                                       : AMDGPU::S_CSELECT_B32;
       auto NewSelect =
-          BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
+          BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
       NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
     }
   }
 
-  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
-  auto UpdatedInst =
-      BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg)
-          .addImm(0)
-          .add(Src1) // False
-          .addImm(0)
-          .add(Src0) // True
-          .addReg(IsSCC ? CopySCC : SCCSource);
-
-  MRI.replaceRegWith(Dest.getReg(), ResultReg);
-  legalizeOperands(*UpdatedInst, MDT);
-  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+  Register NewDestReg = MRI.createVirtualRegister(
+      RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
+  MachineInstr *NewInst;
+  if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
+    NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
+                  .addImm(0)
+                  .add(Src1) // False
+                  .addImm(0)
+                  .add(Src0) // True
+                  .addReg(NewCondReg);
+  } else {
+    NewInst =
+        BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
+            .add(Src1) // False
+            .add(Src0) // True
+            .addReg(NewCondReg);
+  }
+  MRI.replaceRegWith(Dest.getReg(), NewDestReg);
+  legalizeOperands(*NewInst, MDT);
+  addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
 }
 
 void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
diff --git a/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
new file mode 100644
index 0000000000000..ac196635b363a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck %s
+
+define amdgpu_cs <2 x i32> @f() {
+; CHECK-LABEL: f:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_mov_b32 s1, s0
+; CHECK-NEXT:    s_mov_b32 s2, s0
+; CHECK-NEXT:    s_mov_b32 s3, s0
+; CHECK-NEXT:    s_mov_b32 s4, s0
+; CHECK-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; CHECK-NEXT:    s_mov_b32 s5, s0
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; CHECK-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; CHECK-NEXT:    ; return to shader part epilog
+bb:
+  %i = call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
+  %i1 = bitcast <2 x i32> %i to i64
+  %i2 = insertelement <3 x i64> zeroinitializer, i64 %i1, i64 2
+  %i3 = icmp ne <3 x i64> %i2, zeroinitializer
+  %i4 = zext <3 x i1> %i3 to <3 x i64>
+  %i5 = bitcast <3 x i64> %i4 to <6 x i32>
+  %i6 = shufflevector <6 x i32> %i5, <6 x i32> zeroinitializer, <2 x i32> <i32 4, i32 5>
+  call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %i6, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
+  ret <2 x i32> %i6
+}
+
+declare <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32>, i32, i32, i32 immarg)
+declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
index 8d0a9899b5dbc..b1715bd9d3328 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
@@ -233,3 +233,22 @@ body: |
     %0:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
     %1:sreg_32 = COPY %0.sub0
 ...
+
+---
+# GCN-LABEL: name: s_cselect_b64
+# GCN: %0:vgpr_32 = IMPLICIT_DEF
+# GCN: %1:vreg_64 = IMPLICIT_DEF
+# GCN: %2:sreg_32 = IMPLICIT_DEF
+# GCN: %3:sreg_64 = IMPLICIT_DEF
+# GCN: %5:sreg_64_xexec = V_CMP_EQ_U32_e64 %0, 0, implicit $exec
+# GCN: %6:vreg_64 = V_CNDMASK_B64_PSEUDO 0, %1, %5, implicit $exec
name: s_cselect_b64
+body: |
+  bb.0:
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vreg_64 = IMPLICIT_DEF
+    %2:sreg_32 = COPY %0
+    %3:sreg_64 = COPY %1
+    S_CMP_EQ_U32 %2, 0, implicit-def $scc
+    %4:sreg_64 = S_CSELECT_B64 %3, 0, implicit $scc
+...
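
Note, not part of the patch itself: the functional change in lowerSelect is twofold. The result register is now created from RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())) instead of a hardcoded VGPR_32, so a 64-bit destination gets a matching 64-bit VGPR class, and S_CSELECT_B64 is lowered to V_CNDMASK_B64_PSEUDO instead of a 32-bit V_CNDMASK_B32_e64. A rough sketch of the two result shapes, using hypothetical virtual registers (the B64 pseudo carries no source-modifier operands, hence no leading 0 immediates; operand order false/true/cond follows the comments in the patch):

    %r32:vgpr_32 = V_CNDMASK_B32_e64 0, %false, 0, %true, %cond, implicit $exec
    %r64:vreg_64 = V_CNDMASK_B64_PSEUDO %false, %true, %cond, implicit $exec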
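
The EmitInstrWithCustomInserter change then expands V_CNDMASK_B64_PSEUDO by splitting each 64-bit source into its sub0/sub1 halves, selecting each half with a 32-bit cndmask, and recombining the halves; buildExtractSubRegOrImm folds an immediate source into its two 32-bit immediate halves rather than requiring subregister reads, which is what lets the pseudo take the literal 0 seen in the new fix-sgpr-copies.mir test. Roughly, again with hypothetical registers:

    %lo:vgpr_32 = V_CNDMASK_B32_e64 0, %src0.sub0, 0, %src1.sub0, %cond, implicit $exec
    %hi:vgpr_32 = V_CNDMASK_B32_e64 0, %src0.sub1, 0, %src1.sub1, %cond, implicit $exec
    %dst:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1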