diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 6f1a5210fb7e0..52abcac6cea5a 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -2217,16 +2217,33 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1)) return false; - MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1); - bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg); - bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI); - bool Overlapped = OverlappedSrc || OverlappedDst; - - assert(!OverlappedDst || !OverlappedSrc || - Src1->getReg() == MI->getOperand(0).getReg()); assert(ST.needsAlignedVGPRs()); static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1); + const DebugLoc DL = MI->getDebugLoc(); + MachineBasicBlock *MBB = MI->getParent(); + MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1); + + // In: + // + // Dst = shiftrev64 Amt, Src1 + // + // if Dst!=Src1 then avoid the bug with: + // + // Dst.sub0 = Amt + // Dst = shift64 Dst.sub0, Src1 + + Register DstReg = MI->getOperand(0).getReg(); + if (!Src1->isReg() || Src1->getReg() != DstReg) { + Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0); + runOnInstruction( + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstLo).add(*Amt)); + Amt->setReg(DstLo); + Amt->setIsKill(true); + return true; + } + + bool Overlapped = MI->modifiesRegister(AmtReg, &TRI); Register NewReg; for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass : AMDGPU::VGPR_32RegClass) { @@ -2243,8 +2260,6 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { if (Overlapped) NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0); - DebugLoc DL = MI->getDebugLoc(); - MachineBasicBlock *MBB = MI->getParent(); // Insert a full wait count because found register might be pending a wait. BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT)) .addImm(0); @@ -2282,9 +2297,8 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { Amt->setIsKill(false); // We do not update liveness, so verifier may see it as undef. Amt->setIsUndef(); - if (OverlappedDst) + if (Overlapped) { MI->getOperand(0).setReg(NewReg); - if (OverlappedSrc) { Src1->setReg(NewReg); Src1->setIsKill(false); Src1->setIsUndef(); diff --git a/llvm/test/CodeGen/AMDGPU/hazard-shift64.mir b/llvm/test/CodeGen/AMDGPU/hazard-shift64.mir index 616fef1117eb2..74c641bebb78b 100644 --- a/llvm/test/CodeGen/AMDGPU/hazard-shift64.mir +++ b/llvm/test/CodeGen/AMDGPU/hazard-shift64.mir @@ -64,10 +64,8 @@ body: | ; GCN-LABEL: name: highest_reg_shift_amt_used_v0_dst ; GCN: $vgpr7 = IMPLICIT_DEF ; GCN-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF - ; GCN-NEXT: S_WAITCNT 0 - ; GCN-NEXT: $vgpr4, $vgpr7 = V_SWAP_B32 undef $vgpr7, undef $vgpr4, implicit $exec - ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 undef $vgpr4, killed $vgpr2_vgpr3, implicit $exec - ; GCN-NEXT: $vgpr7, $vgpr4 = V_SWAP_B32 $vgpr4, $vgpr7, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 killed $vgpr0, killed $vgpr2_vgpr3, implicit $exec $vgpr7 = IMPLICIT_DEF $vgpr2_vgpr3 = IMPLICIT_DEF renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr2_vgpr3, implicit $exec @@ -82,10 +80,8 @@ body: | ; GCN-LABEL: name: highest_reg_shift_amt_used_v0_src ; GCN: $vgpr7 = IMPLICIT_DEF ; GCN-NEXT: $vgpr0_vgpr1 = IMPLICIT_DEF - ; GCN-NEXT: S_WAITCNT 0 - ; GCN-NEXT: $vgpr4, $vgpr7 = V_SWAP_B32 undef $vgpr7, undef $vgpr4, implicit $exec - ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 undef $vgpr4, killed $vgpr0_vgpr1, implicit $exec - ; GCN-NEXT: $vgpr7, $vgpr4 = V_SWAP_B32 $vgpr4, $vgpr7, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr2, killed $vgpr0_vgpr1, implicit $exec $vgpr7 = IMPLICIT_DEF $vgpr0_vgpr1 = IMPLICIT_DEF renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr0_vgpr1, implicit $exec @@ -118,12 +114,8 @@ body: | ; GCN-LABEL: name: highest_reg_shift_amt_overlapped_src ; GCN: $vgpr7 = IMPLICIT_DEF ; GCN-NEXT: $vgpr6_vgpr7 = IMPLICIT_DEF - ; GCN-NEXT: S_WAITCNT 0 - ; GCN-NEXT: $vgpr2, $vgpr6 = V_SWAP_B32 undef $vgpr6, undef $vgpr2, implicit $exec - ; GCN-NEXT: $vgpr3, $vgpr7 = V_SWAP_B32 undef $vgpr7, undef $vgpr3, implicit $exec - ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 undef $vgpr3, undef $vgpr2_vgpr3, implicit $exec - ; GCN-NEXT: $vgpr6, $vgpr2 = V_SWAP_B32 $vgpr2, $vgpr6, implicit $exec - ; GCN-NEXT: $vgpr7, $vgpr3 = V_SWAP_B32 $vgpr3, $vgpr7, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 killed $vgpr0, killed $vgpr6_vgpr7, implicit $exec $vgpr7 = IMPLICIT_DEF $vgpr6_vgpr7 = IMPLICIT_DEF renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr6_vgpr7, implicit $exec @@ -138,12 +130,8 @@ body: | ; GCN-LABEL: name: highest_reg_shift_amt_overlapped_dst ; GCN: $vgpr7 = IMPLICIT_DEF ; GCN-NEXT: $vgpr0_vgpr1 = IMPLICIT_DEF - ; GCN-NEXT: S_WAITCNT 0 - ; GCN-NEXT: $vgpr2, $vgpr6 = V_SWAP_B32 undef $vgpr6, undef $vgpr2, implicit $exec - ; GCN-NEXT: $vgpr3, $vgpr7 = V_SWAP_B32 undef $vgpr7, undef $vgpr3, implicit $exec - ; GCN-NEXT: $vgpr2_vgpr3 = V_LSHRREV_B64_e64 undef $vgpr3, killed $vgpr0_vgpr1, implicit $exec - ; GCN-NEXT: $vgpr6, $vgpr2 = V_SWAP_B32 $vgpr2, $vgpr6, implicit $exec - ; GCN-NEXT: $vgpr7, $vgpr3 = V_SWAP_B32 $vgpr3, $vgpr7, implicit $exec + ; GCN-NEXT: $vgpr6 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GCN-NEXT: renamable $vgpr6_vgpr7 = V_LSHRREV_B64_e64 killed $vgpr6, killed $vgpr0_vgpr1, implicit $exec $vgpr7 = IMPLICIT_DEF $vgpr0_vgpr1 = IMPLICIT_DEF renamable $vgpr6_vgpr7 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr0_vgpr1, implicit $exec @@ -179,11 +167,8 @@ body: | ; GCN: $vgpr7 = IMPLICIT_DEF ; GCN-NEXT: $vgpr0_vgpr1 = IMPLICIT_DEF ; GCN-NEXT: $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_WAITCNT 0 - ; GCN-NEXT: S_NOP 4 - ; GCN-NEXT: $vgpr4, $vgpr7 = V_SWAP_B32 undef $vgpr7, undef $vgpr4, implicit $exec - ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 undef $vgpr4, killed $vgpr0_vgpr1, implicit $exec - ; GCN-NEXT: $vgpr7, $vgpr4 = V_SWAP_B32 $vgpr4, $vgpr7, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr2, killed $vgpr0_vgpr1, implicit $exec $vgpr7 = IMPLICIT_DEF $vgpr0_vgpr1 = IMPLICIT_DEF $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec