diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp index 3643b777db599e..220f99f7f43647 100644 --- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp @@ -21,17 +21,312 @@ namespace llvm { namespace mca { +void AMDGPUInstrPostProcess::postProcessInstruction( + std::unique_ptr &Inst, const MCInst &MCI) { + switch (MCI.getOpcode()) { + case AMDGPU::S_WAITCNT: + case AMDGPU::S_WAITCNT_EXPCNT: + case AMDGPU::S_WAITCNT_LGKMCNT: + case AMDGPU::S_WAITCNT_VMCNT: + case AMDGPU::S_WAITCNT_VSCNT: + case AMDGPU::S_WAITCNT_EXPCNT_gfx10: + case AMDGPU::S_WAITCNT_LGKMCNT_gfx10: + case AMDGPU::S_WAITCNT_VMCNT_gfx10: + case AMDGPU::S_WAITCNT_VSCNT_gfx10: + case AMDGPU::S_WAITCNT_gfx10: + case AMDGPU::S_WAITCNT_gfx6_gfx7: + case AMDGPU::S_WAITCNT_vi: + return processWaitCnt(Inst, MCI); + } +} + +// s_waitcnt instructions encode important information as immediate operands +// which are lost during the MCInst -> mca::Instruction lowering. 
+void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr &Inst, + const MCInst &MCI) { + for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) { + MCAOperand Op; + const MCOperand &MCOp = MCI.getOperand(Idx); + if (MCOp.isReg()) { + Op = MCAOperand::createReg(MCOp.getReg()); + } else if (MCOp.isImm()) { + Op = MCAOperand::createImm(MCOp.getImm()); + } + Op.setIndex(Idx); + Inst->addOperand(Op); + } +} + AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI, const mca::SourceMgr &SrcMgr, const MCInstrInfo &MCII) - : CustomBehaviour(STI, SrcMgr, MCII) {} + : CustomBehaviour(STI, SrcMgr, MCII) { + generateWaitCntInfo(); +} + +unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef IssuedInst, + const InstRef &IR) { + const Instruction &Inst = *IR.getInstruction(); + unsigned Opcode = Inst.getOpcode(); + + // llvm-mca is generally run on fully compiled assembly so we wouldn't see any + // pseudo instructions here. However, there are plans for the future to make + // it possible to use mca within backend passes. As such, I have left the + // pseudo version of s_waitcnt within this switch statement. + switch (Opcode) { + default: + return 0; + case AMDGPU::S_WAITCNT: // This instruction + case AMDGPU::S_WAITCNT_EXPCNT: + case AMDGPU::S_WAITCNT_LGKMCNT: + case AMDGPU::S_WAITCNT_VMCNT: + case AMDGPU::S_WAITCNT_VSCNT: // to this instruction are all pseudo. + case AMDGPU::S_WAITCNT_EXPCNT_gfx10: + case AMDGPU::S_WAITCNT_LGKMCNT_gfx10: + case AMDGPU::S_WAITCNT_VMCNT_gfx10: + case AMDGPU::S_WAITCNT_VSCNT_gfx10: + case AMDGPU::S_WAITCNT_gfx10: + case AMDGPU::S_WAITCNT_gfx6_gfx7: + case AMDGPU::S_WAITCNT_vi: + // s_endpgm also behaves as if there is an implicit + // s_waitcnt 0, but I'm not sure if it would be appropriate + // to model this in llvm-mca based on how the iterations work + // while simulating the pipeline over and over. 
+ return handleWaitCnt(IssuedInst, IR); + } -unsigned -AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef IssuedInst, - const mca::InstRef &IR) { return 0; } +unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef IssuedInst, + const InstRef &IR) { + // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr. + // I do not know how that instruction works so I did not attempt to model it. + // set the max values to begin + unsigned Vmcnt = 63; + unsigned Expcnt = 7; + unsigned Lgkmcnt = 31; + unsigned Vscnt = 63; + unsigned CurrVmcnt = 0; + unsigned CurrExpcnt = 0; + unsigned CurrLgkmcnt = 0; + unsigned CurrVscnt = 0; + unsigned CyclesToWaitVm = ~0U; + unsigned CyclesToWaitExp = ~0U; + unsigned CyclesToWaitLgkm = ~0U; + unsigned CyclesToWaitVs = ~0U; + + computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt); + + // We will now look at each of the currently executing instructions + // to find out if this wait instruction still needs to wait. + for (auto I = IssuedInst.begin(), E = IssuedInst.end(); I != E; I++) { + const InstRef &PrevIR = *I; + const Instruction &PrevInst = *PrevIR.getInstruction(); + const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size(); + const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex]; + const int CyclesLeft = PrevInst.getCyclesLeft(); + assert(CyclesLeft != UNKNOWN_CYCLES && + "We should know how many cycles are left for this instruction"); + if (PrevInstWaitInfo.VmCnt) { + CurrVmcnt++; + if ((unsigned)CyclesLeft < CyclesToWaitVm) + CyclesToWaitVm = CyclesLeft; + } + if (PrevInstWaitInfo.ExpCnt) { + CurrExpcnt++; + if ((unsigned)CyclesLeft < CyclesToWaitExp) + CyclesToWaitExp = CyclesLeft; + } + if (PrevInstWaitInfo.LgkmCnt) { + CurrLgkmcnt++; + if ((unsigned)CyclesLeft < CyclesToWaitLgkm) + CyclesToWaitLgkm = CyclesLeft; + } + if (PrevInstWaitInfo.VsCnt) { + CurrVscnt++; + if ((unsigned)CyclesLeft < CyclesToWaitVs) + CyclesToWaitVs = CyclesLeft; + } + } + + unsigned CyclesToWait = ~0U; + if 
(CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait) + CyclesToWait = CyclesToWaitVm; + if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait) + CyclesToWait = CyclesToWaitExp; + if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait) + CyclesToWait = CyclesToWaitLgkm; + if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait) + CyclesToWait = CyclesToWaitVs; + + // We may underestimate how many cycles we need to wait, but this + // isn't a big deal. Our return value is just how many cycles until + // this function gets run again. So as long as we don't overestimate + // the wait time, we'll still end up stalling at this instruction + // for the correct number of cycles. + + if (CyclesToWait == ~0U) + return 0; + return CyclesToWait; +} + +void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt, + unsigned &Expcnt, unsigned &Lgkmcnt, + unsigned &Vscnt) { + AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU()); + const Instruction &Inst = *IR.getInstruction(); + unsigned Opcode = Inst.getOpcode(); + + switch (Opcode) { + case AMDGPU::S_WAITCNT_EXPCNT_gfx10: + case AMDGPU::S_WAITCNT_LGKMCNT_gfx10: + case AMDGPU::S_WAITCNT_VMCNT_gfx10: + case AMDGPU::S_WAITCNT_VSCNT_gfx10: { + // Should probably be checking for nullptr + // here, but I'm not sure how I should handle the case + // where we see a nullptr. + const MCAOperand *OpReg = Inst.getOperand(0); + const MCAOperand *OpImm = Inst.getOperand(1); + assert(OpReg && OpReg->isReg() && "First operand should be a register."); + assert(OpImm && OpImm->isImm() && "Second operand should be an immediate."); + if (OpReg->getReg() != AMDGPU::SGPR_NULL) { + // Instruction is using a real register. + // Since we can't know what value this register will have, + // we can't compute what the value of this wait should be. + WithColor::warning() << "The register component of " + << MCII.getName(Opcode) << " will be completely " + << "ignored. 
So the wait may not be accurate.\n"; + } + switch (Opcode) { + // Redundant switch so I don't have to repeat the code above + // for each case. There are more clever ways to avoid this + // extra switch and anyone can feel free to implement one of them. + case AMDGPU::S_WAITCNT_EXPCNT_gfx10: + Expcnt = OpImm->getImm(); + break; + case AMDGPU::S_WAITCNT_LGKMCNT_gfx10: + Lgkmcnt = OpImm->getImm(); + break; + case AMDGPU::S_WAITCNT_VMCNT_gfx10: + Vmcnt = OpImm->getImm(); + break; + case AMDGPU::S_WAITCNT_VSCNT_gfx10: + Vscnt = OpImm->getImm(); + break; + } + return; + } + case AMDGPU::S_WAITCNT_gfx10: + case AMDGPU::S_WAITCNT_gfx6_gfx7: + case AMDGPU::S_WAITCNT_vi: + unsigned WaitCnt = Inst.getOperand(0)->getImm(); + AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt); + return; + } +} + +void AMDGPUCustomBehaviour::generateWaitCntInfo() { + // The core logic from this function is taken from + // SIInsertWaitcnts::updateEventWaitcntAfter() In that pass, the instructions + // that are being looked at are in the MachineInstr format, whereas we have + // access to the MCInst format. The side effects of this are that we can't use + // the mayAccessVMEMThroughFlat(Inst) or mayAccessLDSThroughFlat(Inst) + // functions. Therefore, we conservatively assume that these functions will + // return true. This may cause a few instructions to be incorrectly tagged + // with an extra CNT. However, these are instructions that do interact with at + // least one CNT so giving them an extra CNT shouldn't cause issues in most + // scenarios. 
+ AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU()); + InstrWaitCntInfo.resize(SrcMgr.size()); + + int Index = 0; + for (auto I = SrcMgr.begin(), E = SrcMgr.end(); I != E; ++I, ++Index) { + const std::unique_ptr &Inst = *I; + unsigned Opcode = Inst->getOpcode(); + const MCInstrDesc &MCID = MCII.get(Opcode); + if ((MCID.TSFlags & SIInstrFlags::DS) && + (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) { + InstrWaitCntInfo[Index].LgkmCnt = true; + if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds)) + InstrWaitCntInfo[Index].ExpCnt = true; + } else if (MCID.TSFlags & SIInstrFlags::FLAT) { + // We conservatively assume that mayAccessVMEMThroughFlat(Inst) + // and mayAccessLDSThroughFlat(Inst) would both return true for this + // instruction. We have to do this because those functions use + // information about the memory operands that we don't have access to. + InstrWaitCntInfo[Index].LgkmCnt = true; + if (!STI.hasFeature(AMDGPU::FeatureVscnt)) + InstrWaitCntInfo[Index].VmCnt = true; + else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) + InstrWaitCntInfo[Index].VmCnt = true; + else + InstrWaitCntInfo[Index].VsCnt = true; + } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) { + if (!STI.hasFeature(AMDGPU::FeatureVscnt)) + InstrWaitCntInfo[Index].VmCnt = true; + else if ((MCID.mayLoad() && + !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) || + ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() && + !MCID.mayStore())) + InstrWaitCntInfo[Index].VmCnt = true; + else if (MCID.mayStore()) + InstrWaitCntInfo[Index].VsCnt = true; + + // (IV.Major < 7) is meant to represent + // GCNTarget.vmemWriteNeedsExpWaitcnt() + // which is defined as + // { return getGeneration() < SEA_ISLANDS; } + if (IV.Major < 7 && + (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet))) + InstrWaitCntInfo[Index].ExpCnt = true; + } else if (MCID.TSFlags & SIInstrFlags::SMRD) { + InstrWaitCntInfo[Index].LgkmCnt = true; + } else if 
(MCID.TSFlags & SIInstrFlags::EXP) { + InstrWaitCntInfo[Index].ExpCnt = true; + } else { + switch (Opcode) { + case AMDGPU::S_SENDMSG: + case AMDGPU::S_SENDMSGHALT: + case AMDGPU::S_MEMTIME: + case AMDGPU::S_MEMREALTIME: + InstrWaitCntInfo[Index].LgkmCnt = true; + break; + } + } + } +} + +// taken from SIInstrInfo::isVMEM() +bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) { + return MCID.TSFlags & SIInstrFlags::MUBUF || + MCID.TSFlags & SIInstrFlags::MTBUF || + MCID.TSFlags & SIInstrFlags::MIMG; +} + +// taken from SIInstrInfo::hasModifiersSet() +bool AMDGPUCustomBehaviour::hasModifiersSet( + const std::unique_ptr &Inst, unsigned OpName) const { + int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName); + if (Idx == -1) + return false; + + const MCAOperand *Op = Inst->getOperand(Idx); + if (Op == nullptr || !Op->isImm() || !Op->getImm()) + return false; + + return true; +} + +// taken from SIInstrInfo::isAlwaysGDS() +bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const { + return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_GWS_INIT || + Opcode == AMDGPU::DS_GWS_SEMA_V || Opcode == AMDGPU::DS_GWS_SEMA_BR || + Opcode == AMDGPU::DS_GWS_SEMA_P || + Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || + Opcode == AMDGPU::DS_GWS_BARRIER; +} + } // namespace mca } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h index daefad28699c22..728c5455ff49c2 100644 --- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h @@ -25,23 +25,68 @@ namespace llvm { namespace mca { class AMDGPUInstrPostProcess : public InstrPostProcess { + void processWaitCnt(std::unique_ptr &Inst, const MCInst &MCI); + public: AMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) : InstrPostProcess(STI, MCII) {} ~AMDGPUInstrPostProcess() {} - void postProcessInstruction(std::unique_ptr &Inst, - const MCInst 
&MCI) override {} + void postProcessInstruction(std::unique_ptr &Inst, + const MCInst &MCI) override; +}; + +struct WaitCntInfo { + bool VmCnt = false; + bool ExpCnt = false; + bool LgkmCnt = false; + bool VsCnt = false; }; class AMDGPUCustomBehaviour : public CustomBehaviour { + /// Whenever MCA would like to dispatch an s_waitcnt instruction, + /// we must check all the instructions that are still executing to see if + /// they modify the same CNT as we need to wait for. This vector + /// gets built in the constructor and contains 1 WaitCntInfo struct + /// for each instruction within the SourceMgr. Each element + /// tells us which CNTs that instruction may interact with. + /// We conservatively assume some instructions interact with more + /// CNTs than they do in reality, so we will occasionally wait + /// longer than necessary, but we shouldn't ever wait for less time than necessary. + std::vector InstrWaitCntInfo; + + /// This method gets called from the constructor and is + /// where we set up the InstrWaitCntInfo vector. + /// The core logic for determining which CNTs an instruction + /// interacts with is taken from SIInsertWaitcnts::updateEventWaitcntAfter(). + /// Unfortunately, some of the logic from that function is not available to us + /// in this scope so we conservatively end up assuming that some + /// instructions interact with more CNTs than they do in reality. + void generateWaitCntInfo(); + /// Helper function used in generateWaitCntInfo() + bool hasModifiersSet(const std::unique_ptr &Inst, + unsigned OpName) const; + /// Helper function used in generateWaitCntInfo() + bool isAlwaysGDS(uint16_t Opcode) const; + /// Helper function used in generateWaitCntInfo() + bool isVMEM(const MCInstrDesc &MCID); + /// This method gets called from checkCustomHazard when mca is attempting to + /// dispatch an s_waitcnt instruction (or one of its variants). 
The method + /// looks at each of the instructions that are still executing in the pipeline + /// to determine if the waitcnt should force a wait. + unsigned handleWaitCnt(ArrayRef IssuedInst, const InstRef &IR); + /// Based on the type of s_waitcnt instruction we are looking at, and what its + /// operands are, this method will set the values for each of the cnt + /// references provided as arguments. + void computeWaitCnt(const InstRef &IR, unsigned &Vmcnt, unsigned &Expcnt, + unsigned &Lgkmcnt, unsigned &Vscnt); + public: AMDGPUCustomBehaviour(const MCSubtargetInfo &STI, const mca::SourceMgr &SrcMgr, const MCInstrInfo &MCII); ~AMDGPUCustomBehaviour() {} - /// This method is used to determine if an instruction /// should be allowed to be dispatched. The return value is /// how many cycles until the instruction can be dispatched. @@ -49,10 +94,9 @@ class AMDGPUCustomBehaviour : public CustomBehaviour { /// register and hardware dependencies so this method should only /// implement custom behaviour and dependencies that are not picked up /// by MCA naturally. - unsigned checkCustomHazard(ArrayRef IssuedInst, - const mca::InstRef &IR) override; + unsigned checkCustomHazard(ArrayRef IssuedInst, + const InstRef &IR) override; }; - } // namespace mca } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index b24c061af7ab79..0792b303b83097 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -137,6 +137,7 @@ def MIReadVGPR : SchedReadVariant<[ // The latency values are 1 / (operations / cycle) / 4. 
multiclass SICommonWriteRes { + let RetireOOO = 1 in { // llvm-mca specific flag def : HWWriteRes; def : HWWriteRes; def : HWWriteRes; // Can be between 2 and 64 @@ -159,6 +160,7 @@ multiclass SICommonWriteRes { def : HWWriteRes; let ResourceCycles = [16] in def : HWWriteRes; + } // End RetireOOO = 1 def : ReadAdvance; @@ -182,6 +184,7 @@ let SchedModel = SIFullSpeedModel in { defm : SICommonWriteRes; +let RetireOOO = 1 in { // llvm-mca specific flag def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; @@ -189,6 +192,7 @@ def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; +} // End RetireOOO = 1 def : InstRW<[WriteCopy], (instrs COPY)>; @@ -198,6 +202,7 @@ let SchedModel = SIQuarterSpeedModel in { defm : SICommonWriteRes; +let RetireOOO = 1 in { // llvm-mca specific flag def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; @@ -205,6 +210,7 @@ def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; +} // End RetireOOO = 1 def : InstRW<[WriteCopy], (instrs COPY)>; def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>; @@ -218,6 +224,7 @@ let SchedModel = SIDPFullSpeedModel in { defm : SICommonWriteRes; +let RetireOOO = 1 in { // llvm-mca specific flag def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; @@ -225,6 +232,7 @@ def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; +} // End RetireOOO = 1 def : InstRW<[WriteCopy], (instrs COPY)>; def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>; @@ -240,6 +248,7 @@ let SchedModel = GFX10SpeedModel in { // The latency values are 1 / (operations / cycle). // Add 1 stall cycle for VGPR read. 
+let RetireOOO = 1 in { // llvm-mca specific flag def : HWWriteRes; def : HWWriteRes; def : HWWriteRes; @@ -259,6 +268,7 @@ def : HWWriteRes; def : HWWriteRes; def : HWWriteRes; def : HWWriteRes; +} // End RetireOOO = 1 def : InstRW<[WriteCopy], (instrs COPY)>; diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s index 0ffdad05cfa67e..00b429ef6d67d5 100644 --- a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s +++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s @@ -41,12 +41,12 @@ v_sqrt_f64 v[4:5], v[4:5] # CHECK: Iterations: 1 # CHECK-NEXT: Instructions: 28 -# CHECK-NEXT: Total Cycles: 224 +# CHECK-NEXT: Total Cycles: 205 # CHECK-NEXT: Total uOps: 29 # CHECK: Dispatch Width: 1 -# CHECK-NEXT: uOps Per Cycle: 0.13 -# CHECK-NEXT: IPC: 0.13 +# CHECK-NEXT: uOps Per Cycle: 0.14 +# CHECK-NEXT: IPC: 0.14 # CHECK-NEXT: Block RThroughput: 29.0 # CHECK: Instruction Info: @@ -133,37 +133,37 @@ v_sqrt_f64 v[4:5], v[4:5] # CHECK-NEXT: - - - 1.00 - 1.00 1.00 - v_sqrt_f64_e32 v[4:5], v[4:5] # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 -# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123 - -# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_i32_f64_e32 v0, v[0:1] -# CHECK-NEXT: [0,1] .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_i32_e32 v[2:3], v2 -# CHECK-NEXT: [0,2] . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f32_f64_e32 v4, v[4:5] -# CHECK-NEXT: [0,3] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_f32_e32 v[6:7], v6 -# CHECK-NEXT: [0,4] . 
DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_u32_f64_e32 v8, v[8:9] -# CHECK-NEXT: [0,5] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_u32_e32 v[10:11], v10 -# CHECK-NEXT: [0,6] . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_frexp_exp_i32_f64_e32 v0, v[0:1] -# CHECK-NEXT: [0,7] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_frexp_mant_f64_e32 v[2:3], v[2:3] -# CHECK-NEXT: [0,8] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_fract_f64_e32 v[4:5], v[4:5] -# CHECK-NEXT: [0,9] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_trunc_f64_e32 v[0:1], v[0:1] -# CHECK-NEXT: [0,10] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_ceil_f64_e32 v[2:3], v[2:3] -# CHECK-NEXT: [0,11] . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_rndne_f64_e32 v[4:5], v[4:5] -# CHECK-NEXT: [0,12] . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_floor_f64_e32 v[6:7], v[6:7] -# CHECK-NEXT: [0,13] . . . . . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_fma_f64 v[0:1], v[0:1], v[0:1], v[0:1] -# CHECK-NEXT: [0,14] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . v_add_f64 v[2:3], v[2:3], v[2:3] -# CHECK-NEXT: [0,15] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . v_mul_f64 v[4:5], v[4:5], v[4:5] -# CHECK-NEXT: [0,16] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . 
. . . . . . . . . . . . . . . . . v_min_f64 v[6:7], v[6:7], v[6:7] -# CHECK-NEXT: [0,17] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . v_max_f64 v[8:9], v[8:9], v[8:9] -# CHECK-NEXT: [0,18] . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1] -# CHECK-NEXT: [0,19] . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1] -# CHECK-NEXT: [0,20] . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . v_ldexp_f64 v[2:3], v[2:3], v0 -# CHECK-NEXT: [0,21] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1] -# CHECK-NEXT: [0,22] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . v_trig_preop_f64 v[2:3], v[2:3], v0 -# CHECK-NEXT: [0,23] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1] -# CHECK-NEXT: [0,24] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . v_cmp_class_f64_e64 vcc_lo, v[2:3], s0 -# CHECK-NEXT: [0,25] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE . v_rcp_f64_e32 v[0:1], v[0:1] -# CHECK-NEXT: [0,26] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE. v_rsq_f64_e32 v[2:3], v[2:3] -# CHECK-NEXT: [0,27] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
DeeeeeeeeeeeeeeeeeeeeeeeE v_sqrt_f64_e32 v[4:5], v[4:5] +# CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 +# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 01234 + +# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_i32_f64_e32 v0, v[0:1] +# CHECK-NEXT: [0,1] .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_i32_e32 v[2:3], v2 +# CHECK-NEXT: [0,2] . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f32_f64_e32 v4, v[4:5] +# CHECK-NEXT: [0,3] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_f32_e32 v[6:7], v6 +# CHECK-NEXT: [0,4] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_u32_f64_e32 v8, v[8:9] +# CHECK-NEXT: [0,5] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_u32_e32 v[10:11], v10 +# CHECK-NEXT: [0,6] . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_frexp_exp_i32_f64_e32 v0, v[0:1] +# CHECK-NEXT: [0,7] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_frexp_mant_f64_e32 v[2:3], v[2:3] +# CHECK-NEXT: [0,8] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_fract_f64_e32 v[4:5], v[4:5] +# CHECK-NEXT: [0,9] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_trunc_f64_e32 v[0:1], v[0:1] +# CHECK-NEXT: [0,10] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_ceil_f64_e32 v[2:3], v[2:3] +# CHECK-NEXT: [0,11] . . . . . 
. . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_rndne_f64_e32 v[4:5], v[4:5] +# CHECK-NEXT: [0,12] . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . v_floor_f64_e32 v[6:7], v[6:7] +# CHECK-NEXT: [0,13] . . . . . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . v_fma_f64 v[0:1], v[0:1], v[0:1], v[0:1] +# CHECK-NEXT: [0,14] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . v_add_f64 v[2:3], v[2:3], v[2:3] +# CHECK-NEXT: [0,15] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . v_mul_f64 v[4:5], v[4:5], v[4:5] +# CHECK-NEXT: [0,16] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . v_min_f64 v[6:7], v[6:7], v[6:7] +# CHECK-NEXT: [0,17] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . v_max_f64 v[8:9], v[8:9], v[8:9] +# CHECK-NEXT: [0,18] . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1] +# CHECK-NEXT: [0,19] . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1] +# CHECK-NEXT: [0,20] . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . v_ldexp_f64 v[2:3], v[2:3], v0 +# CHECK-NEXT: [0,21] . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1] +# CHECK-NEXT: [0,22] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . v_trig_preop_f64 v[2:3], v[2:3], v0 +# CHECK-NEXT: [0,23] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . 
v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1] +# CHECK-NEXT: [0,24] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . v_cmp_class_f64_e64 vcc_lo, v[2:3], s0 +# CHECK-NEXT: [0,25] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE . v_rcp_f64_e32 v[0:1], v[0:1] +# CHECK-NEXT: [0,26] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE. v_rsq_f64_e32 v[2:3], v[2:3] +# CHECK-NEXT: [0,27] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE v_sqrt_f64_e32 v[4:5], v[4:5] # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s new file mode 100644 index 00000000000000..939d3b06201318 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s @@ -0,0 +1,233 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx900 --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s + +s_load_dwordx2 s[2:3], s[0:1], 0x24 +s_load_dwordx2 s[0:1], s[0:1], 0x2c +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v0, s2 +v_mov_b32_e32 v1, s3 +flat_load_dword v2, v[0:1] +flat_load_dword v3, v[0:1] offset:8 +flat_load_dword v4, v[0:1] offset:16 +flat_load_dword v5, v[0:1] offset:24 +v_mov_b32_e32 v0, s0 +v_mov_b32_e32 v1, s1 +v_mov_b32_e32 v6, s6 +v_mov_b32_e32 v7, s7 +v_mov_b32_e32 v8, s8 +v_mov_b32_e32 v9, s9 +v_mov_b32_e32 v10, s10 +v_mov_b32_e32 v11, s11 +v_mov_b32_e32 v12, s12 +v_mov_b32_e32 v13, s13 +v_mov_b32_e32 v14, s14 +v_mov_b32_e32 v15, s15 +v_mov_b32_e32 v16, s16 +v_mov_b32_e32 v17, s17 +v_mov_b32_e32 v18, s18 +v_mov_b32_e32 v19, s19 +v_mov_b32_e32 v20, s20 +v_mov_b32_e32 v21, s21 +v_mov_b32_e32 v22, s22 +v_mov_b32_e32 v23, s23 +v_mov_b32_e32 v24, s24 +v_mov_b32_e32 v25, s25 
+v_mov_b32_e32 v26, s26 +v_mov_b32_e32 v27, s27 +v_mov_b32_e32 v28, s28 +v_mov_b32_e32 v29, s29 +s_waitcnt vmcnt(0) lgkmcnt(0) + +# CHECK: Iterations: 1 +# CHECK-NEXT: Instructions: 36 +# CHECK-NEXT: Total Cycles: 331 +# CHECK-NEXT: Total uOps: 36 + +# CHECK: Dispatch Width: 1 +# CHECK-NEXT: uOps Per Cycle: 0.11 +# CHECK-NEXT: IPC: 0.11 +# CHECK-NEXT: Block RThroughput: 36.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 5 1.00 * s_load_dwordx2 s[2:3], s[0:1], 0x24 +# CHECK-NEXT: 1 5 1.00 * s_load_dwordx2 s[0:1], s[0:1], 0x2c +# CHECK-NEXT: 1 1 1.00 U s_waitcnt lgkmcnt(0) +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v0, s2 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v1, s3 +# CHECK-NEXT: 1 80 1.00 * U flat_load_dword v2, v[0:1] +# CHECK-NEXT: 1 80 1.00 * U flat_load_dword v3, v[0:1] offset:8 +# CHECK-NEXT: 1 80 1.00 * U flat_load_dword v4, v[0:1] offset:16 +# CHECK-NEXT: 1 80 1.00 * U flat_load_dword v5, v[0:1] offset:24 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v0, s0 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v1, s1 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v6, s6 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v7, s7 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v8, s8 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v9, s9 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v10, s10 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v11, s11 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v12, s12 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v13, s13 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v14, s14 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v15, s15 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v16, s16 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v17, s17 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v18, s18 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v19, s19 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v20, s20 +# CHECK-NEXT: 1 1 
1.00 U v_mov_b32_e32 v21, s21 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v22, s22 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v23, s23 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v24, s24 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v25, s25 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v26, s26 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v27, s27 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v28, s28 +# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v29, s29 +# CHECK-NEXT: 1 1 1.00 U s_waitcnt vmcnt(0) lgkmcnt(0) + +# CHECK: Resources: +# CHECK-NEXT: [0] - HWBranch +# CHECK-NEXT: [1] - HWExport +# CHECK-NEXT: [2] - HWLGKM +# CHECK-NEXT: [3] - HWSALU +# CHECK-NEXT: [4] - HWVALU +# CHECK-NEXT: [5] - HWVMEM +# CHECK-NEXT: [6] - HWXDL + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] +# CHECK-NEXT: - - 2.00 2.00 28.00 4.00 - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: - - 1.00 - - - - s_load_dwordx2 s[2:3], s[0:1], 0x24 +# CHECK-NEXT: - - 1.00 - - - - s_load_dwordx2 s[0:1], s[0:1], 0x2c +# CHECK-NEXT: - - - 1.00 - - - s_waitcnt lgkmcnt(0) +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v0, s2 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v1, s3 +# CHECK-NEXT: - - - - - 1.00 - flat_load_dword v2, v[0:1] +# CHECK-NEXT: - - - - - 1.00 - flat_load_dword v3, v[0:1] offset:8 +# CHECK-NEXT: - - - - - 1.00 - flat_load_dword v4, v[0:1] offset:16 +# CHECK-NEXT: - - - - - 1.00 - flat_load_dword v5, v[0:1] offset:24 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v0, s0 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v1, s1 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v6, s6 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v7, s7 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v8, s8 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v9, s9 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v10, s10 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v11, s11 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v12, s12 +# CHECK-NEXT: - - - - 
1.00 - - v_mov_b32_e32 v13, s13 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v14, s14 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v15, s15 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v16, s16 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v17, s17 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v18, s18 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v19, s19 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v20, s20 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v21, s21 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v22, s22 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v23, s23 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v24, s24 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v25, s25 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v26, s26 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v27, s27 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v28, s28 +# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v29, s29 +# CHECK-NEXT: - - - 1.00 - - - s_waitcnt vmcnt(0) lgkmcnt(0) + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0 +# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 + +# CHECK: [0,0] DeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . s_load_dwordx2 s[2:3], s[0:1], 0x24 +# CHECK-NEXT: [0,1] .DeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . s_load_dwordx2 s[0:1], s[0:1], 0x2c +# CHECK-NEXT: [0,2] . .DE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . s_waitcnt lgkmcnt(0) +# CHECK-NEXT: [0,3] . . DE . . . 
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_mov_b32_e32 v0, s2 +# CHECK-NEXT: [0,4] . . DE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_mov_b32_e32 v1, s3 +# CHECK-NEXT: [0,5] . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . flat_load_dword v2, v[0:1] +# CHECK-NEXT: [0,6] . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . flat_load_dword v3, v[0:1] offset:8 +# CHECK-NEXT: [0,7] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . flat_load_dword v4, v[0:1] offset:16 +# CHECK-NEXT: [0,8] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE. flat_load_dword v5, v[0:1] offset:24 +# CHECK-NEXT: [0,9] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v0, s0 +# CHECK-NEXT: [0,10] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v1, s1 +# CHECK-NEXT: [0,11] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v6, s6 +# CHECK-NEXT: [0,12] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE. . . . . . . . . . . . . . . . 
v_mov_b32_e32 v7, s7 +# CHECK-NEXT: [0,13] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . . v_mov_b32_e32 v8, s8 +# CHECK-NEXT: [0,14] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . . v_mov_b32_e32 v9, s9 +# CHECK-NEXT: [0,15] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DE . . . . . . . . . . . . . . . v_mov_b32_e32 v10, s10 +# CHECK-NEXT: [0,16] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . . v_mov_b32_e32 v11, s11 +# CHECK-NEXT: [0,17] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE. . . . . . . . . . . . . . . v_mov_b32_e32 v12, s12 +# CHECK-NEXT: [0,18] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . v_mov_b32_e32 v13, s13 +# CHECK-NEXT: [0,19] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . v_mov_b32_e32 v14, s14 +# CHECK-NEXT: [0,20] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DE . . . . . . . . . . . . . . v_mov_b32_e32 v15, s15 +# CHECK-NEXT: [0,21] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . v_mov_b32_e32 v16, s16 +# CHECK-NEXT: [0,22] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE. . . . . . . . . . . . . . v_mov_b32_e32 v17, s17 +# CHECK-NEXT: [0,23] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . 
v_mov_b32_e32 v18, s18 +# CHECK-NEXT: [0,24] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . v_mov_b32_e32 v19, s19 +# CHECK-NEXT: [0,25] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DE . . . . . . . . . . . . . v_mov_b32_e32 v20, s20 +# CHECK-NEXT: [0,26] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . v_mov_b32_e32 v21, s21 +# CHECK-NEXT: [0,27] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE. . . . . . . . . . . . . v_mov_b32_e32 v22, s22 +# CHECK-NEXT: [0,28] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . v_mov_b32_e32 v23, s23 +# CHECK-NEXT: [0,29] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . v_mov_b32_e32 v24, s24 +# CHECK-NEXT: [0,30] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DE . . . . . . . . . . . . v_mov_b32_e32 v25, s25 +# CHECK-NEXT: [0,31] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . v_mov_b32_e32 v26, s26 +# CHECK-NEXT: [0,32] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE. . . . . . . . . . . . v_mov_b32_e32 v27, s27 +# CHECK-NEXT: [0,33] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . v_mov_b32_e32 v28, s28 +# CHECK-NEXT: [0,34] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . 
v_mov_b32_e32 v29, s29 +# CHECK-NEXT: [0,35] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE s_waitcnt vmcnt(0) lgkmcnt(0) + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 0.0 0.0 0.0 s_load_dwordx2 s[2:3], s[0:1], 0x24 +# CHECK-NEXT: 1. 1 0.0 0.0 0.0 s_load_dwordx2 s[0:1], s[0:1], 0x2c +# CHECK-NEXT: 2. 1 0.0 0.0 0.0 s_waitcnt lgkmcnt(0) +# CHECK-NEXT: 3. 1 0.0 0.0 0.0 v_mov_b32_e32 v0, s2 +# CHECK-NEXT: 4. 1 0.0 0.0 0.0 v_mov_b32_e32 v1, s3 +# CHECK-NEXT: 5. 1 0.0 0.0 0.0 flat_load_dword v2, v[0:1] +# CHECK-NEXT: 6. 1 0.0 0.0 0.0 flat_load_dword v3, v[0:1] offset:8 +# CHECK-NEXT: 7. 1 0.0 0.0 0.0 flat_load_dword v4, v[0:1] offset:16 +# CHECK-NEXT: 8. 1 0.0 0.0 0.0 flat_load_dword v5, v[0:1] offset:24 +# CHECK-NEXT: 9. 1 0.0 0.0 0.0 v_mov_b32_e32 v0, s0 +# CHECK-NEXT: 10. 1 0.0 0.0 0.0 v_mov_b32_e32 v1, s1 +# CHECK-NEXT: 11. 1 0.0 0.0 0.0 v_mov_b32_e32 v6, s6 +# CHECK-NEXT: 12. 1 0.0 0.0 0.0 v_mov_b32_e32 v7, s7 +# CHECK-NEXT: 13. 1 0.0 0.0 0.0 v_mov_b32_e32 v8, s8 +# CHECK-NEXT: 14. 1 0.0 0.0 0.0 v_mov_b32_e32 v9, s9 +# CHECK-NEXT: 15. 1 0.0 0.0 0.0 v_mov_b32_e32 v10, s10 +# CHECK-NEXT: 16. 1 0.0 0.0 0.0 v_mov_b32_e32 v11, s11 +# CHECK-NEXT: 17. 1 0.0 0.0 0.0 v_mov_b32_e32 v12, s12 +# CHECK-NEXT: 18. 1 0.0 0.0 0.0 v_mov_b32_e32 v13, s13 +# CHECK-NEXT: 19. 1 0.0 0.0 0.0 v_mov_b32_e32 v14, s14 +# CHECK-NEXT: 20. 1 0.0 0.0 0.0 v_mov_b32_e32 v15, s15 +# CHECK-NEXT: 21. 1 0.0 0.0 0.0 v_mov_b32_e32 v16, s16 +# CHECK-NEXT: 22. 1 0.0 0.0 0.0 v_mov_b32_e32 v17, s17 +# CHECK-NEXT: 23. 1 0.0 0.0 0.0 v_mov_b32_e32 v18, s18 +# CHECK-NEXT: 24. 
1 0.0 0.0 0.0 v_mov_b32_e32 v19, s19 +# CHECK-NEXT: 25. 1 0.0 0.0 0.0 v_mov_b32_e32 v20, s20 +# CHECK-NEXT: 26. 1 0.0 0.0 0.0 v_mov_b32_e32 v21, s21 +# CHECK-NEXT: 27. 1 0.0 0.0 0.0 v_mov_b32_e32 v22, s22 +# CHECK-NEXT: 28. 1 0.0 0.0 0.0 v_mov_b32_e32 v23, s23 +# CHECK-NEXT: 29. 1 0.0 0.0 0.0 v_mov_b32_e32 v24, s24 +# CHECK-NEXT: 30. 1 0.0 0.0 0.0 v_mov_b32_e32 v25, s25 +# CHECK-NEXT: 31. 1 0.0 0.0 0.0 v_mov_b32_e32 v26, s26 +# CHECK-NEXT: 32. 1 0.0 0.0 0.0 v_mov_b32_e32 v27, s27 +# CHECK-NEXT: 33. 1 0.0 0.0 0.0 v_mov_b32_e32 v28, s28 +# CHECK-NEXT: 34. 1 0.0 0.0 0.0 v_mov_b32_e32 v29, s29 +# CHECK-NEXT: 35. 1 0.0 0.0 0.0 s_waitcnt vmcnt(0) lgkmcnt(0) +# CHECK-NEXT: 1 0.0 0.0 0.0