diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 1c20805e2e82b0..4bff5b9ac45aaa 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -471,26 +471,30 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, if (!TRI.isVGPR(MRI, X)) return nullptr; - for (MachineOperand &YTop : MRI.use_nodbg_operands(T)) { - if (YTop.getSubReg() != Tsub) + const unsigned SearchLimit = 16; + unsigned Count = 0; + for (auto Iter = std::next(MovT.getIterator()), + E = MovT.getParent()->instr_end(); + Iter != E && Count < SearchLimit; ++Iter, ++Count) { + + MachineInstr *MovY = &*Iter; + if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 && + MovY->getOpcode() != AMDGPU::COPY) || + !MovY->getOperand(1).isReg() || + MovY->getOperand(1).getReg() != T || + MovY->getOperand(1).getSubReg() != Tsub) continue; - MachineInstr &MovY = *YTop.getParent(); - if ((MovY.getOpcode() != AMDGPU::V_MOV_B32_e32 && - MovY.getOpcode() != AMDGPU::COPY) || - MovY.getOperand(1).getSubReg() != Tsub) - continue; - - Register Y = MovY.getOperand(0).getReg(); - unsigned Ysub = MovY.getOperand(0).getSubReg(); + Register Y = MovY->getOperand(0).getReg(); + unsigned Ysub = MovY->getOperand(0).getSubReg(); - if (!TRI.isVGPR(MRI, Y) || MovT.getParent() != MovY.getParent()) + if (!TRI.isVGPR(MRI, Y)) continue; MachineInstr *MovX = nullptr; - auto I = std::next(MovT.getIterator()), E = MovT.getParent()->instr_end(); - for (auto IY = MovY.getIterator(); I != E && I != IY; ++I) { - if (instReadsReg(&*I, X, Xsub, TRI) || + for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator()); + I != IY; ++I) { + if (instReadsReg(&*I, X, Xsub, TRI) || instModifiesReg(&*I, Y, Ysub, TRI) || instModifiesReg(&*I, T, Tsub, TRI) || (MovX && instModifiesReg(&*I, X, Xsub, TRI))) { @@ -515,7 +519,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, MovX = 
&*I; } - if (!MovX || I == E) + if (!MovX) continue; - LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << MovY); + LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY); @@ -532,7 +536,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, .addReg(X1.Reg, 0, X1.SubReg).getInstr(); } MovX->eraseFromParent(); - MovY.eraseFromParent(); + MovY->eraseFromParent(); MachineInstr *Next = &*std::next(MovT.getIterator()); if (MRI.use_nodbg_empty(T)) MovT.eraseFromParent(); diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir b/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir index 6e3aaa98b1f083..9f36e0b5d68543 100644 --- a/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir +++ b/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir @@ -562,3 +562,113 @@ body: | %1.sub0 = COPY %2.sub0 S_ENDPGM 0 ... + +# GCN-LABEL: name: swap_exact_max_insns_apart +# GCN: bb.0: +# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF +# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF +# GCN-NEXT: %3:vgpr_32 = IMPLICIT_DEF +# GCN-NEXT: %4:vgpr_32 = COPY %3 +# GCN-NEXT: %3:vgpr_32 = COPY %4 +# GCN-NEXT: %4:vgpr_32 = COPY %3 +# GCN-NEXT: %3:vgpr_32 = COPY %4 +# GCN-NEXT: %4:vgpr_32 = COPY %3 +# GCN-NEXT: %3:vgpr_32 = COPY %4 +# GCN-NEXT: %4:vgpr_32 = COPY %3 +# GCN-NEXT: %3:vgpr_32 = COPY %4 +# GCN-NEXT: %4:vgpr_32 = COPY %3 +# GCN-NEXT: %3:vgpr_32 = COPY %4 +# GCN-NEXT: %4:vgpr_32 = COPY %3 +# GCN-NEXT: %3:vgpr_32 = COPY %4 +# GCN-NEXT: %4:vgpr_32 = COPY %3 +# GCN-NEXT: %0:vgpr_32, %1:vgpr_32 = V_SWAP_B32 %1, %0, implicit $exec +# GCN-NEXT: S_ENDPGM 0 + +--- +name: swap_exact_max_insns_apart +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: vgpr_32 } +body: | + bb.0: + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = COPY %0 + %3 = IMPLICIT_DEF + %4 = COPY %3 + %3 = COPY %4 + %4 = COPY %3 + %3 = COPY %4 + %4 = COPY %3 + %3 = COPY %4 + %4 = COPY %3 + %3 = COPY %4 + %4 = COPY %3 + %3 = COPY %4 + %4 = COPY %3 + %3 = COPY %4 + %4 = COPY %3 + %0 = COPY %1 + %1 = COPY %2 + S_ENDPGM 0 
+... + +# GCN-LABEL: name: swap_too_far +# GCN: bb.0: +# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF +# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF +# GCN-NEXT: %2:vgpr_32 = COPY %0 +# GCN-NEXT: %3:vgpr_32 = IMPLICIT_DEF +# GCN-NEXT: %4:vgpr_32 = COPY %3 +# GCN-NEXT: %3:vgpr_32 = COPY %4 +# GCN-NEXT: %4:vgpr_32 = COPY %3 +# GCN-NEXT: %3:vgpr_32 = COPY %4 +# GCN-NEXT: %4:vgpr_32 = COPY %3 +# GCN-NEXT: %3:vgpr_32 = COPY %4 +# GCN-NEXT: %4:vgpr_32 = COPY %3 +# GCN-NEXT: %3:vgpr_32 = COPY %4 +# GCN-NEXT: %4:vgpr_32 = COPY %3 +# GCN-NEXT: %3:vgpr_32 = COPY %4 +# GCN-NEXT: %4:vgpr_32 = COPY %3 +# GCN-NEXT: %3:vgpr_32 = COPY %4 +# GCN-NEXT: %4:vgpr_32 = COPY %3 +# GCN-NEXT: %3:vgpr_32 = COPY %4 +# GCN-NEXT: %0:vgpr_32 = COPY %1 +# GCN-NEXT: %1:vgpr_32 = COPY %2 +# GCN-NEXT: S_ENDPGM 0 + +--- +name: swap_too_far +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: vgpr_32 } +body: | + bb.0: + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = COPY %0 + %3 = IMPLICIT_DEF + %4 = COPY %3 + %3 = COPY %4 + %4 = COPY %3 + %3 = COPY %4 + %4 = COPY %3 + %3 = COPY %4 + %4 = COPY %3 + %3 = COPY %4 + %4 = COPY %3 + %3 = COPY %4 + %4 = COPY %3 + %3 = COPY %4 + %4 = COPY %3 + %3 = COPY %4 + %0 = COPY %1 + %1 = COPY %2 + S_ENDPGM 0 +...