diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 5a2aee2fa7643..e2866d21ef580 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -103,6 +103,7 @@ namespace impl_detail {
 // FIXME: Remove these declarations once RegisterClassInfo is queryable as an
 // analysis.
 class MachineSchedulerImpl;
+class SSAMachineSchedulerImpl;
 class PostMachineSchedulerImpl;
 } // namespace impl_detail
 
@@ -1464,6 +1465,20 @@ class MachineSchedulerPass : public PassInfoMixin<MachineSchedulerPass> {
                         MachineFunctionAnalysisManager &MFAM);
 };
 
+class SSAMachineSchedulerPass : public PassInfoMixin<SSAMachineSchedulerPass> {
+  // FIXME: Remove this member once RegisterClassInfo is queryable as an
+  // analysis.
+  std::unique_ptr<impl_detail::SSAMachineSchedulerImpl> Impl;
+  const TargetMachine *TM;
+
+public:
+  SSAMachineSchedulerPass(const TargetMachine *TM);
+  SSAMachineSchedulerPass(SSAMachineSchedulerPass &&Other);
+  ~SSAMachineSchedulerPass();
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+
 class PostMachineSchedulerPass
     : public PassInfoMixin<PostMachineSchedulerPass> {
   // FIXME: Remove this member once RegisterClassInfo is queryable as an
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index f17d550623efc..2c30ac21446f5 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -165,6 +165,9 @@ LLVM_ABI extern char &MachineSchedulerID;
 /// PostMachineScheduler - This pass schedules machine instructions postRA.
 LLVM_ABI extern char &PostMachineSchedulerID;
 
+/// SSAMachineScheduler - This pass schedules machine instructions in SSA form.
+LLVM_ABI extern char &SSAMachineSchedulerID;
+
 /// SpillPlacement analysis. Suggest optimal placement of spill code between
 /// basic blocks.
 LLVM_ABI extern char &SpillPlacementID;
diff --git a/llvm/include/llvm/CodeGen/TargetPassConfig.h b/llvm/include/llvm/CodeGen/TargetPassConfig.h
index 5e0e641a981f9..1bf8cfc639ff7 100644
--- a/llvm/include/llvm/CodeGen/TargetPassConfig.h
+++ b/llvm/include/llvm/CodeGen/TargetPassConfig.h
@@ -135,6 +135,10 @@ class LLVM_ABI TargetPassConfig : public ImmutablePass {
   /// replace a copy.
   bool EnableSinkAndFold = false;
 
+  /// Enable insertion of the SSAMachineScheduler pass; this triggers early
+  /// computation of live intervals.
+  bool EnableSSAMachineScheduler = false;
+
   /// Require processing of functions such that callees are generated before
   /// callers.
   bool RequireCodeGenSCCOrder = false;
@@ -205,6 +209,13 @@ class LLVM_ABI TargetPassConfig : public ImmutablePass {
     setOpt(RequireCodeGenSCCOrder, Enable);
   }
 
+  bool getEnableSSAMachineScheduler() const {
+    return EnableSSAMachineScheduler;
+  }
+  void setEnableSSAMachineScheduler(bool Enable) {
+    setOpt(EnableSSAMachineScheduler, Enable);
+  }
+
   /// Allow the target to override a specific pass without overriding the pass
   /// pipeline. When passes are added to the standard pipeline at the
   /// point where StandardID is expected, add TargetID in its place.
diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
index a8c7a8aff83cf..fd2c4a0d13a36 100644
--- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -220,6 +220,10 @@ class LLVM_ABI TargetSubtargetInfo : public MCSubtargetInfo {
   /// allocation.
   virtual bool enablePostRAMachineScheduler() const;
 
+  /// True if the subtarget should run a machine scheduler before PHI
+  /// elimination.
+  virtual bool enableSSAMachineScheduler() const;
+
   /// True if the subtarget should run the atomic expansion pass.
   virtual bool enableAtomicExpand() const;
 
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 88272f053c114..c2f2765854945 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -289,6 +289,7 @@ LLVM_ABI void initializeReplaceWithVeclibLegacyPass(PassRegistry &);
 LLVM_ABI void initializeResetMachineFunctionPass(PassRegistry &);
 LLVM_ABI void initializeSCEVAAWrapperPassPass(PassRegistry &);
 LLVM_ABI void initializeSROALegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeSSAMachineSchedulerPass(PassRegistry &);
 LLVM_ABI void initializeSafeStackLegacyPassPass(PassRegistry &);
 LLVM_ABI void initializeSafepointIRVerifierPass(PassRegistry &);
 LLVM_ABI void initializeSelectOptimizePass(PassRegistry &);
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 04a0da06fb6ec..bef95af375664 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -122,6 +122,7 @@ MACHINE_FUNCTION_PASS("machine-cse", MachineCSEPass())
 MACHINE_FUNCTION_PASS("machine-latecleanup", MachineLateInstrsCleanupPass())
 MACHINE_FUNCTION_PASS("machine-sanmd", MachineSanitizerBinaryMetadataPass())
 MACHINE_FUNCTION_PASS("machine-scheduler", MachineSchedulerPass(TM))
+MACHINE_FUNCTION_PASS("ssa-machine-scheduler", SSAMachineSchedulerPass(TM))
 MACHINE_FUNCTION_PASS("machinelicm", MachineLICMPass())
 MACHINE_FUNCTION_PASS("no-op-machine-function", NoOpMachineFunctionPass())
 MACHINE_FUNCTION_PASS("opt-phis", OptimizePHIsPass())
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index 9e0cb3bf44906..cf81a0d240004 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -120,6 +120,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeRemoveLoadsIntoFakeUsesLegacyPass(Registry);
   initializeRemoveRedundantDebugValuesLegacyPass(Registry);
   initializeRenameIndependentSubregsLegacyPass(Registry);
+  initializeSSAMachineSchedulerPass(Registry);
   initializeSafeStackLegacyPassPass(Registry);
   initializeSelectOptimizePass(Registry);
   initializeShadowStackGCLoweringPass(Registry);
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 299bcc46e4bd2..db216d3d42b77 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -350,6 +350,33 @@ class MachineSchedulerImpl : public MachineSchedulerBase {
   ScheduleDAGInstrs *createMachineScheduler();
 };
 
+/// Impl class for SSAMachineScheduler.
+class SSAMachineSchedulerImpl : public MachineSchedulerBase {
+  // These are only for using MF.verify()
+  // remove when verify supports passing in all analyses
+  MachineFunctionPass *P = nullptr;
+  MachineFunctionAnalysisManager *MFAM = nullptr;
+
+public:
+  struct RequiredAnalyses {
+    MachineLoopInfo &MLI;
+    MachineDominatorTree &MDT;
+    AAResults &AA;
+    LiveIntervals &LIS;
+  };
+
+  SSAMachineSchedulerImpl() {}
+  // Migration only
+  void setLegacyPass(MachineFunctionPass *P) { this->P = P; }
+  void setMFAM(MachineFunctionAnalysisManager *MFAM) { this->MFAM = MFAM; }
+
+  bool run(MachineFunction &MF, const TargetMachine &TM,
+           const RequiredAnalyses &Analyses);
+
+protected:
+  ScheduleDAGInstrs *createMachineScheduler();
+};
+
 /// Impl class for PostMachineScheduler.
 class PostMachineSchedulerImpl : public MachineSchedulerBase {
   // These are only for using MF.verify()
@@ -380,6 +407,7 @@ class PostMachineSchedulerImpl : public MachineSchedulerBase {
 using impl_detail::MachineSchedulerBase;
 using impl_detail::MachineSchedulerImpl;
 using impl_detail::PostMachineSchedulerImpl;
+using impl_detail::SSAMachineSchedulerImpl;
 
 namespace {
 /// MachineScheduler runs after coalescing and before register allocation.
@@ -394,6 +422,18 @@ class MachineSchedulerLegacy : public MachineFunctionPass {
   static char ID; // Class identification, replacement for typeinfo
 };
 
+/// SSAMachineScheduler runs before PHI elimination.
+class SSAMachineScheduler : public MachineFunctionPass {
+  SSAMachineSchedulerImpl Impl;
+
+public:
+  SSAMachineScheduler();
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  bool runOnMachineFunction(MachineFunction &) override;
+
+  static char ID; // Class identification, replacement for typeinfo
+};
+
 /// PostMachineScheduler runs after shortly before code emission.
 class PostMachineSchedulerLegacy : public MachineFunctionPass {
   PostMachineSchedulerImpl Impl;
@@ -439,6 +479,35 @@ void MachineSchedulerLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
+char SSAMachineScheduler::ID = 0;
+
+char &llvm::SSAMachineSchedulerID = SSAMachineScheduler::ID;
+
+INITIALIZE_PASS_BEGIN(SSAMachineScheduler, "ssa-machine-scheduler",
+                      "SSA Machine Instruction Scheduler", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
+INITIALIZE_PASS_END(SSAMachineScheduler, "ssa-machine-scheduler",
+                    "SSA Machine Instruction Scheduler", false, false)
+
+SSAMachineScheduler::SSAMachineScheduler() : MachineFunctionPass(ID) {
+  initializeSSAMachineSchedulerPass(*PassRegistry::getPassRegistry());
+}
+
+void SSAMachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
+  AU.addRequired<MachineLoopInfoWrapperPass>();
+  AU.addRequired<AAResultsWrapperPass>();
+  AU.addRequired<TargetPassConfig>();
+  AU.addRequired<SlotIndexesWrapperPass>();
+  AU.addRequired<LiveIntervalsWrapperPass>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
 char PostMachineSchedulerLegacy::ID = 0;
 
 char &llvm::PostMachineSchedulerID = PostMachineSchedulerLegacy::ID;
@@ -490,6 +559,11 @@ static cl::opt<bool> EnableMachineSched(
     cl::desc("Enable the machine instruction scheduling pass."), cl::init(true),
     cl::Hidden);
 
+static cl::opt<bool> EnableSSAMachineSched(
+    "enable-ssa-misched",
+    cl::desc("Enable the machine instruction scheduling pass in SSA."),
+    cl::init(false), cl::Hidden);
+
 static cl::opt<bool> EnablePostRAMachineSched(
     "enable-post-misched",
     cl::desc("Enable the post-ra machine instruction scheduling pass."),
@@ -586,6 +660,53 @@ bool MachineSchedulerImpl::run(MachineFunction &Func, const TargetMachine &TM,
   return true;
 }
 
+/// Instantiate a ScheduleDAGInstrs that will be owned by the caller.
+ScheduleDAGInstrs *SSAMachineSchedulerImpl::createMachineScheduler() {
+  // Get the default scheduler set by the target for this function.
+  ScheduleDAGInstrs *Scheduler = TM->createMachineScheduler(this);
+  if (Scheduler)
+    return Scheduler;
+
+  // Default to GenericScheduler.
+  return createSchedLive(this);
+}
+
+bool SSAMachineSchedulerImpl::run(MachineFunction &Func,
+                                  const TargetMachine &TM,
+                                  const RequiredAnalyses &Analyses) {
+  MF = &Func;
+  MLI = &Analyses.MLI;
+  MDT = &Analyses.MDT;
+  this->TM = &TM;
+  AA = &Analyses.AA;
+  LIS = &Analyses.LIS;
+
+  if (VerifyScheduling) {
+    LLVM_DEBUG(LIS->dump());
+    const char *MSchedBanner = "Before machine scheduling.";
+    if (P)
+      MF->verify(P, MSchedBanner, &errs());
+    else
+      MF->verify(*MFAM, MSchedBanner, &errs());
+  }
+  RegClassInfo->runOnMachineFunction(*MF);
+
+  // Instantiate the selected scheduler for this target, function, and
+  // optimization level.
+  std::unique_ptr<ScheduleDAGInstrs> Scheduler(createMachineScheduler());
+  scheduleRegions(*Scheduler, false);
+
+  LLVM_DEBUG(LIS->dump());
+  if (VerifyScheduling) {
+    const char *MSchedBanner = "After machine scheduling.";
+    if (P)
+      MF->verify(P, MSchedBanner, &errs());
+    else
+      MF->verify(*MFAM, MSchedBanner, &errs());
+  }
+  return true;
+}
+
 /// Instantiate a ScheduleDAGInstrs for PostRA scheduling that will be owned by
 /// the caller. We don't have a command line option to override the postRA
 /// scheduler. The Target must configure it.
@@ -668,12 +789,38 @@ bool MachineSchedulerLegacy::runOnMachineFunction(MachineFunction &MF) {
   return Impl.run(MF, TM, {MLI, MDT, AA, LIS});
 }
 
+bool SSAMachineScheduler::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  if (EnableSSAMachineSched.getNumOccurrences()) {
+    if (!EnableSSAMachineSched)
+      return false;
+  } else if (!MF.getSubtarget().enableSSAMachineScheduler()) {
+    return false;
+  }
+
+  auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+  auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+  auto &TM = getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+  auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+  auto &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+  Impl.setLegacyPass(this);
+  return Impl.run(MF, TM, {MLI, MDT, AA, LIS});
+}
+
 MachineSchedulerPass::MachineSchedulerPass(const TargetMachine *TM)
     : Impl(std::make_unique<MachineSchedulerImpl>()), TM(TM) {}
 MachineSchedulerPass::~MachineSchedulerPass() = default;
 MachineSchedulerPass::MachineSchedulerPass(MachineSchedulerPass &&Other) =
     default;
 
+SSAMachineSchedulerPass::SSAMachineSchedulerPass(const TargetMachine *TM)
+    : Impl(std::make_unique<SSAMachineSchedulerImpl>()), TM(TM) {}
+SSAMachineSchedulerPass::SSAMachineSchedulerPass(
+    SSAMachineSchedulerPass &&Other) = default;
+SSAMachineSchedulerPass::~SSAMachineSchedulerPass() = default;
+
 PostMachineSchedulerPass::PostMachineSchedulerPass(const TargetMachine *TM)
     : Impl(std::make_unique<PostMachineSchedulerImpl>()), TM(TM) {}
 PostMachineSchedulerPass::PostMachineSchedulerPass(
@@ -708,6 +855,33 @@ MachineSchedulerPass::run(MachineFunction &MF,
       .preserve();
 }
 
+PreservedAnalyses
+SSAMachineSchedulerPass::run(MachineFunction &MF,
+                             MachineFunctionAnalysisManager &MFAM) {
+  if (EnableSSAMachineSched.getNumOccurrences()) {
+    if (!EnableSSAMachineSched)
+      return PreservedAnalyses::all();
+  } else if (!MF.getSubtarget().enableSSAMachineScheduler()) {
+    LLVM_DEBUG(dbgs() << "Subtarget disables ssa-MI-sched.\n");
+    return PreservedAnalyses::all();
+  }
+
+  auto &MLI = MFAM.getResult<MachineLoopAnalysis>(MF);
+  auto &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
+  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
+                  .getManager();
+  auto &AA = FAM.getResult<AAManager>(MF.getFunction());
+  auto &LIS = MFAM.getResult<LiveIntervalsAnalysis>(MF);
+  Impl->setMFAM(&MFAM);
+  bool Changed = Impl->run(MF, *TM, {MLI, MDT, AA, LIS});
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
+
 bool
 PostMachineSchedulerLegacy::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -764,11 +938,10 @@ PostMachineSchedulerPass::run(MachineFunction &MF,
 /// the boundary, but there would be no benefit to postRA scheduling across
 /// calls this late anyway.
 static bool isSchedBoundary(MachineBasicBlock::iterator MI,
-                            MachineBasicBlock *MBB,
-                            MachineFunction *MF,
+                            MachineBasicBlock *MBB, MachineFunction *MF,
                             const TargetInstrInfo *TII) {
   return MI->isCall() || TII->isSchedulingBoundary(*MI, MBB, *MF) ||
-         MI->isFakeUse();
+         MI->isFakeUse() || MI->isPHI();
 }
 
 using MBBRegionsVector = SmallVector<SchedRegion, 16>;
 
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index b6169e6c4dc34..2f4c47212215e 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1479,6 +1479,12 @@ void TargetPassConfig::addOptimizedRegAlloc() {
   addPass(&UnreachableMachineBlockElimID);
   addPass(&LiveVariablesID);
 
+  // Run the SSA machine scheduler just before PHI elimination.
+  if (EnableSSAMachineScheduler) {
+    addPass(&LiveIntervalsID);
+    addPass(&SSAMachineSchedulerID);
+  }
+
   // Edge splitting is smarter with machine loop info.
   addPass(&MachineLoopInfoID);
   addPass(&PHIEliminationID);
diff --git a/llvm/lib/CodeGen/TargetSubtargetInfo.cpp b/llvm/lib/CodeGen/TargetSubtargetInfo.cpp
index cd396e6a619a8..cee5162223f71 100644
--- a/llvm/lib/CodeGen/TargetSubtargetInfo.cpp
+++ b/llvm/lib/CodeGen/TargetSubtargetInfo.cpp
@@ -54,6 +54,8 @@ bool TargetSubtargetInfo::enablePostRAMachineScheduler() const {
   return enableMachineScheduler() && enablePostRAScheduler();
 }
 
+bool TargetSubtargetInfo::enableSSAMachineScheduler() const { return false; }
+
 bool TargetSubtargetInfo::useAA() const {
   return false;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 92a587b5771b6..f8ec24c21efaf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -526,6 +526,11 @@ static cl::opt<bool> HasClosedWorldAssumption(
     cl::desc("Whether has closed-world assumption at link time"),
     cl::init(false), cl::Hidden);
 
+static cl::opt<bool>
+    UseSSAMachineScheduler("amdgpu-use-ssa-machine-scheduler",
+                           cl::desc("Use the machine scheduler in SSA mode."),
+                           cl::init(false), cl::Hidden);
+
 extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
@@ -1255,6 +1260,12 @@ AMDGPUPassConfig::AMDGPUPassConfig(TargetMachine &TM, PassManagerBase &PM)
   // Garbage collection is not supported.
   disablePass(&GCLoweringID);
   disablePass(&ShadowStackGCLoweringID);
+
+  if (UseSSAMachineScheduler) {
+    // Use the SSA machine scheduler instead of the regular machine scheduler.
+    disablePass(&MachineSchedulerID);
+    setEnableSSAMachineScheduler(true);
+  }
 }
 
 void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
@@ -1594,20 +1605,24 @@ void GCNPassConfig::addOptimizedRegAlloc() {
   if (EnableRewritePartialRegUses)
     insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
 
+  // Insertion point for passes depends on which machine scheduler is used.
+  AnalysisID EndOfPreRA = UseSSAMachineScheduler ?
+      &RenameIndependentSubregsID : &MachineSchedulerID;
+
   if (isPassEnabled(EnablePreRAOptimizations))
-    insertPass(&MachineSchedulerID, &GCNPreRAOptimizationsID);
+    insertPass(EndOfPreRA, &GCNPreRAOptimizationsID);
 
   // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
   // instructions that cause scheduling barriers.
-  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
+  insertPass(EndOfPreRA, &SIWholeQuadModeID);
 
   if (OptExecMaskPreRA)
-    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
+    insertPass(EndOfPreRA, &SIOptimizeExecMaskingPreRAID);
 
   // This is not an essential optimization and it has a noticeable impact on
   // compilation time, so we only enable it from O2.
   if (TM->getOptLevel() > CodeGenOptLevel::Less)
-    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
+    insertPass(EndOfPreRA, &SIFormMemoryClausesID);
 
   TargetPassConfig::addOptimizedRegAlloc();
 }
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index ef63acc6355d2..2fcc57bdd2100 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -613,6 +613,8 @@ bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI,
   for (auto &MO : CurrMI->operands()) {
     if (!MO.isReg() || !MO.getReg().isVirtual())
       continue;
+    if (MO.isUse() && CurrMI->getOpcode() == AMDGPU::PHI)
+      break;
     if (MO.isUse() && !MO.readsReg())
       continue;
     if (!UseInternalIterator && MO.isDef())
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index fab78a93aa063..e92ec4a26193b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1797,6 +1797,10 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
     if (UseRegion != MIRegion.end() && UseRegion->second == I)
       continue;
 
+    // Cannot insert instructions before PHIs.
+    if (UseMI->isPHI())
+      continue;
+
     // Do not rematerialize an instruction if it uses or is used by an
     // instruction that we have designated for rematerialization.
     // FIXME: Allow for rematerialization chains: this requires 1.
updating diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index a54d6651c25c1..8ed3ded5758c9 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1036,6 +1036,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return true; } + bool enableSSAMachineScheduler() const override { return true; } + bool useAA() const override; bool enableSubRegLiveness() const override { diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll index b8814b64735e6..b02c214895aec 100644 --- a/llvm/test/CodeGen/AMDGPU/add.ll +++ b/llvm/test/CodeGen/AMDGPU/add.ll @@ -4,7 +4,8 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-DEFSCHED %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-use-ssa-machine-scheduler=1 < %s | FileCheck -check-prefixes=GFX12,GFX12-SSASCHED %s define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: s_add_i32: @@ -161,19 +162,33 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: s_add_v2i32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s2, s4, s6 -; GFX12-NEXT: s_add_co_i32 s3, s5, s7 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_endpgm +; GFX12-DEFSCHED-LABEL: s_add_v2i32: +; GFX12-DEFSCHED: ; %bb.0: +; GFX12-DEFSCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-DEFSCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-DEFSCHED-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-DEFSCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-DEFSCHED-NEXT: s_add_co_i32 s2, s4, s6 +; GFX12-DEFSCHED-NEXT: s_add_co_i32 s3, s5, s7 +; GFX12-DEFSCHED-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-DEFSCHED-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-DEFSCHED-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-DEFSCHED-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-DEFSCHED-NEXT: s_endpgm +; +; GFX12-SSASCHED-LABEL: s_add_v2i32: +; GFX12-SSASCHED: ; %bb.0: +; GFX12-SSASCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SSASCHED-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SSASCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-SSASCHED-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-SSASCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-SSASCHED-NEXT: s_add_co_i32 s2, s4, s6 +; GFX12-SSASCHED-NEXT: s_add_co_i32 s3, s5, s7 +; GFX12-SSASCHED-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SSASCHED-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SSASCHED-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SSASCHED-NEXT: s_endpgm %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 %a = load <2 x i32>, ptr addrspace(1) %in %b = load <2 x i32>, ptr addrspace(1) %b_ptr @@ -274,22 +289,39 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; 
GFX11-NEXT: global_store_b128 v4, v[0:3], s[8:9] ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: s_add_v4i32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s3, s3, s7 -; GFX12-NEXT: s_add_co_i32 s2, s2, s6 -; GFX12-NEXT: s_add_co_i32 s0, s0, s4 -; GFX12-NEXT: s_add_co_i32 s1, s1, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-NEXT: global_store_b128 v4, v[0:3], s[8:9] -; GFX12-NEXT: s_endpgm +; GFX12-DEFSCHED-LABEL: s_add_v4i32: +; GFX12-DEFSCHED: ; %bb.0: +; GFX12-DEFSCHED-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX12-DEFSCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-DEFSCHED-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX12-DEFSCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-DEFSCHED-NEXT: s_add_co_i32 s3, s3, s7 +; GFX12-DEFSCHED-NEXT: s_add_co_i32 s2, s2, s6 +; GFX12-DEFSCHED-NEXT: s_add_co_i32 s0, s0, s4 +; GFX12-DEFSCHED-NEXT: s_add_co_i32 s1, s1, s5 +; GFX12-DEFSCHED-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-DEFSCHED-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 +; GFX12-DEFSCHED-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX12-DEFSCHED-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-DEFSCHED-NEXT: global_store_b128 v4, v[0:3], s[8:9] +; GFX12-DEFSCHED-NEXT: s_endpgm +; +; GFX12-SSASCHED-LABEL: s_add_v4i32: +; GFX12-SSASCHED: ; %bb.0: +; GFX12-SSASCHED-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX12-SSASCHED-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-SSASCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-SSASCHED-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX12-SSASCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-SSASCHED-NEXT: s_add_co_i32 s3, s3, s7 +; GFX12-SSASCHED-NEXT: s_add_co_i32 s2, s2, s6 +; GFX12-SSASCHED-NEXT: s_add_co_i32 s0, s0, s4 +; GFX12-SSASCHED-NEXT: s_add_co_i32 s1, s1, s5 +; GFX12-SSASCHED-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SSASCHED-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SSASCHED-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-SSASCHED-NEXT: global_store_b128 v4, v[0:3], s[8:9] +; GFX12-SSASCHED-NEXT: s_endpgm %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 %a = load <4 x i32>, ptr addrspace(1) %in %b = load <4 x i32>, ptr addrspace(1) %b_ptr @@ -436,29 +468,53 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: s_add_v8i32: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b512 s[8:23], s[4:5], 0x44 -; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s4, s9, s17 -; GFX12-NEXT: s_add_co_i32 s5, s8, s16 -; GFX12-NEXT: s_add_co_i32 s6, s15, s23 -; GFX12-NEXT: s_add_co_i32 s7, s14, s22 -; GFX12-NEXT: s_add_co_i32 s8, s12, s20 -; GFX12-NEXT: s_add_co_i32 s9, s13, s21 -; GFX12-NEXT: s_add_co_i32 s2, s11, s19 -; GFX12-NEXT: s_add_co_i32 s3, s10, s18 -; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9 -; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s6 -; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s4 -; GFX12-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v7, s2 -; GFX12-NEXT: v_mov_b32_e32 v6, s3 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] -; GFX12-NEXT: s_endpgm +; GFX12-DEFSCHED-LABEL: s_add_v8i32: +; GFX12-DEFSCHED: ; %bb.0: ; %entry +; GFX12-DEFSCHED-NEXT: s_clause 0x1 +; GFX12-DEFSCHED-NEXT: s_load_b512 s[8:23], s[4:5], 0x44 +; GFX12-DEFSCHED-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12-DEFSCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-DEFSCHED-NEXT: s_add_co_i32 s4, s9, s17 +; GFX12-DEFSCHED-NEXT: s_add_co_i32 s5, s8, s16 +; GFX12-DEFSCHED-NEXT: s_add_co_i32 s6, s15, s23 +; GFX12-DEFSCHED-NEXT: s_add_co_i32 s7, s14, s22 +; GFX12-DEFSCHED-NEXT: s_add_co_i32 s8, s12, s20 +; GFX12-DEFSCHED-NEXT: s_add_co_i32 s9, s13, s21 +; GFX12-DEFSCHED-NEXT: s_add_co_i32 s2, s11, s19 +; GFX12-DEFSCHED-NEXT: s_add_co_i32 s3, s10, s18 +; GFX12-DEFSCHED-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9 +; GFX12-DEFSCHED-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s6 +; GFX12-DEFSCHED-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s4 +; GFX12-DEFSCHED-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v7, s2 +; GFX12-DEFSCHED-NEXT: v_mov_b32_e32 v6, s3 +; GFX12-DEFSCHED-NEXT: s_clause 0x1 +; GFX12-DEFSCHED-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX12-DEFSCHED-NEXT: global_store_b128 v8, v[4:7], s[0:1] +; GFX12-DEFSCHED-NEXT: s_endpgm +; +; GFX12-SSASCHED-LABEL: s_add_v8i32: +; GFX12-SSASCHED: ; %bb.0: ; %entry +; GFX12-SSASCHED-NEXT: s_clause 0x1 +; GFX12-SSASCHED-NEXT: s_load_b512 s[8:23], s[4:5], 0x44 +; GFX12-SSASCHED-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12-SSASCHED-NEXT: v_mov_b32_e32 v8, 0 +; GFX12-SSASCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-SSASCHED-NEXT: s_add_co_i32 s4, s9, s17 +; GFX12-SSASCHED-NEXT: s_add_co_i32 s5, s8, s16 +; GFX12-SSASCHED-NEXT: s_add_co_i32 s6, s15, s23 +; GFX12-SSASCHED-NEXT: s_add_co_i32 s7, s14, s22 +; GFX12-SSASCHED-NEXT: s_add_co_i32 s8, s12, s20 +; GFX12-SSASCHED-NEXT: s_add_co_i32 s9, s13, s21 +; GFX12-SSASCHED-NEXT: s_add_co_i32 s2, s11, s19 +; GFX12-SSASCHED-NEXT: s_add_co_i32 s3, s10, s18 +; GFX12-SSASCHED-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX12-SSASCHED-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s6 +; GFX12-SSASCHED-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s4 +; GFX12-SSASCHED-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s2 +; GFX12-SSASCHED-NEXT: s_clause 0x1 +; GFX12-SSASCHED-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX12-SSASCHED-NEXT: global_store_b128 v8, v[4:7], s[0:1] +; GFX12-SSASCHED-NEXT: s_endpgm entry: %0 = add <8 x i32> %a, %b store <8 x i32> %0, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index baccb4c7d0859..9fa6c3bf0eb1f 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -4,7 +4,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX12 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX12,GFX12-DEFSCHED %s +; RUN: llc -amdgpu-scalarize-global-loads=false 
-mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -amdgpu-use-ssa-machine-scheduler=1 < %s | FileCheck -check-prefixes=GFX12,GFX12-SSASCHED %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1250 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX1250 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s @@ -106,24 +107,41 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: test_mul_v2i32: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s6, -1 -; GFX12-NEXT: s_mov_b32 s7, 0x31016000 -; GFX12-NEXT: s_mov_b32 s10, s6 -; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s8, s2 -; GFX12-NEXT: s_mov_b32 s9, s3 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null -; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3 -; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2 -; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX12-NEXT: s_endpgm +; GFX12-DEFSCHED-LABEL: test_mul_v2i32: +; GFX12-DEFSCHED: ; %bb.0: ; %entry +; GFX12-DEFSCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s6, -1 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s10, s6 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s11, s7 +; GFX12-DEFSCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s8, s2 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s9, s3 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s4, s0 +; GFX12-DEFSCHED-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null +; GFX12-DEFSCHED-NEXT: s_mov_b32 s5, s1 +; GFX12-DEFSCHED-NEXT: s_wait_loadcnt 0x0 +; GFX12-DEFSCHED-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX12-DEFSCHED-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX12-DEFSCHED-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX12-DEFSCHED-NEXT: s_endpgm +; +; GFX12-SSASCHED-LABEL: test_mul_v2i32: +; GFX12-SSASCHED: ; %bb.0: ; %entry +; GFX12-SSASCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SSASCHED-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-SSASCHED-NEXT: s_mov_b32 s6, -1 +; GFX12-SSASCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-SSASCHED-NEXT: s_mov_b32 s4, s2 +; GFX12-SSASCHED-NEXT: s_mov_b32 s5, s3 +; GFX12-SSASCHED-NEXT: s_mov_b32 s2, s6 +; GFX12-SSASCHED-NEXT: buffer_load_b128 v[0:3], off, s[4:7], null +; GFX12-SSASCHED-NEXT: s_mov_b32 s3, s7 +; GFX12-SSASCHED-NEXT: s_wait_loadcnt 0x0 +; GFX12-SSASCHED-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX12-SSASCHED-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX12-SSASCHED-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX12-SSASCHED-NEXT: s_endpgm ; ; GFX1250-LABEL: test_mul_v2i32: ; GFX1250: ; %bb.0: ; %entry @@ -283,28 +301,49 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_mul_v4i32: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s6, -1 -; GFX12-NEXT: s_mov_b32 s7, 0x31016000 -; GFX12-NEXT: s_mov_b32 s10, s6 -; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s8, s2 -; GFX12-NEXT: s_mov_b32 s9, s3 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null -; GFX12-NEXT: buffer_load_b128 v[4:7], off, s[8:11], null offset:16 -; 
GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mul_lo_u32 v3, v3, v7 -; GFX12-NEXT: v_mul_lo_u32 v2, v2, v6 -; GFX12-NEXT: v_mul_lo_u32 v1, v1, v5 -; GFX12-NEXT: v_mul_lo_u32 v0, v0, v4 -; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[4:7], null -; GFX12-NEXT: s_endpgm +; GFX12-DEFSCHED-LABEL: v_mul_v4i32: +; GFX12-DEFSCHED: ; %bb.0: ; %entry +; GFX12-DEFSCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s6, -1 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s10, s6 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s11, s7 +; GFX12-DEFSCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s8, s2 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s9, s3 +; GFX12-DEFSCHED-NEXT: s_clause 0x1 +; GFX12-DEFSCHED-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null +; GFX12-DEFSCHED-NEXT: buffer_load_b128 v[4:7], off, s[8:11], null offset:16 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s4, s0 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s5, s1 +; GFX12-DEFSCHED-NEXT: s_wait_loadcnt 0x0 +; GFX12-DEFSCHED-NEXT: v_mul_lo_u32 v3, v3, v7 +; GFX12-DEFSCHED-NEXT: v_mul_lo_u32 v2, v2, v6 +; GFX12-DEFSCHED-NEXT: v_mul_lo_u32 v1, v1, v5 +; GFX12-DEFSCHED-NEXT: v_mul_lo_u32 v0, v0, v4 +; GFX12-DEFSCHED-NEXT: buffer_store_b128 v[0:3], off, s[4:7], null +; GFX12-DEFSCHED-NEXT: s_endpgm +; +; GFX12-SSASCHED-LABEL: v_mul_v4i32: +; GFX12-SSASCHED: ; %bb.0: ; %entry +; GFX12-SSASCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SSASCHED-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-SSASCHED-NEXT: s_mov_b32 s6, -1 +; GFX12-SSASCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-SSASCHED-NEXT: s_mov_b32 s4, s2 +; GFX12-SSASCHED-NEXT: s_mov_b32 s5, s3 +; GFX12-SSASCHED-NEXT: s_clause 0x1 +; GFX12-SSASCHED-NEXT: buffer_load_b128 v[0:3], off, s[4:7], null +; GFX12-SSASCHED-NEXT: buffer_load_b128 v[4:7], off, s[4:7], null offset:16 +; GFX12-SSASCHED-NEXT: s_mov_b32 s2, s6 +; GFX12-SSASCHED-NEXT: s_mov_b32 s3, s7 +; GFX12-SSASCHED-NEXT: s_wait_loadcnt 0x0 +; GFX12-SSASCHED-NEXT: v_mul_lo_u32 v3, v3, v7 +; GFX12-SSASCHED-NEXT: v_mul_lo_u32 v2, v2, v6 +; GFX12-SSASCHED-NEXT: v_mul_lo_u32 v1, v1, v5 +; GFX12-SSASCHED-NEXT: v_mul_lo_u32 v0, v0, v4 +; GFX12-SSASCHED-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null +; GFX12-SSASCHED-NEXT: s_endpgm ; ; GFX1250-LABEL: v_mul_v4i32: ; GFX1250: ; %bb.0: ; %entry @@ -588,28 +627,49 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_trunc_i64_mul_to_i32: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX12-NEXT: s_mov_b32 s10, -1 -; GFX12-NEXT: s_mov_b32 s11, 0x31016000 -; GFX12-NEXT: s_mov_b32 s14, s10 -; GFX12-NEXT: s_mov_b32 s15, s11 -; GFX12-NEXT: s_mov_b32 s6, s10 -; GFX12-NEXT: s_mov_b32 s7, s11 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s12, s2 -; GFX12-NEXT: s_mov_b32 s13, s3 -; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null -; GFX12-NEXT: buffer_load_b32 v1, off, s[4:7], null -; GFX12-NEXT: s_mov_b32 s8, s0 -; GFX12-NEXT: s_mov_b32 s9, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null -; GFX12-NEXT: s_endpgm +; GFX12-DEFSCHED-LABEL: v_trunc_i64_mul_to_i32: +; GFX12-DEFSCHED: ; %bb.0: ; %entry +; GFX12-DEFSCHED-NEXT: s_clause 0x1 +; GFX12-DEFSCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; 
GFX12-DEFSCHED-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s10, -1 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s14, s10 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s15, s11 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s6, s10 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s7, s11 +; GFX12-DEFSCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s12, s2 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s13, s3 +; GFX12-DEFSCHED-NEXT: buffer_load_b32 v0, off, s[12:15], null +; GFX12-DEFSCHED-NEXT: buffer_load_b32 v1, off, s[4:7], null +; GFX12-DEFSCHED-NEXT: s_mov_b32 s8, s0 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s9, s1 +; GFX12-DEFSCHED-NEXT: s_wait_loadcnt 0x0 +; GFX12-DEFSCHED-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX12-DEFSCHED-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX12-DEFSCHED-NEXT: s_endpgm +; +; GFX12-SSASCHED-LABEL: v_trunc_i64_mul_to_i32: +; GFX12-SSASCHED: ; %bb.0: ; %entry +; GFX12-SSASCHED-NEXT: s_clause 0x1 +; GFX12-SSASCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SSASCHED-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-SSASCHED-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-SSASCHED-NEXT: s_mov_b32 s6, -1 +; GFX12-SSASCHED-NEXT: s_mov_b32 s11, s7 +; GFX12-SSASCHED-NEXT: s_mov_b32 s10, s6 +; GFX12-SSASCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-SSASCHED-NEXT: s_mov_b32 s8, s2 +; GFX12-SSASCHED-NEXT: s_mov_b32 s9, s3 +; GFX12-SSASCHED-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX12-SSASCHED-NEXT: buffer_load_b32 v1, off, s[4:7], null +; GFX12-SSASCHED-NEXT: s_mov_b32 s2, s6 +; GFX12-SSASCHED-NEXT: s_mov_b32 s3, s7 +; GFX12-SSASCHED-NEXT: s_wait_loadcnt 0x0 +; GFX12-SSASCHED-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX12-SSASCHED-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX12-SSASCHED-NEXT: s_endpgm ; ; GFX1250-LABEL: v_trunc_i64_mul_to_i32: ; GFX1250: ; %bb.0: ; %entry @@ -736,18 +796,31 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: mul64_sext_c: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_ashr_i32 s3, s2, 31 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 -; GFX12-NEXT: s_mov_b32 s3, 0x31016000 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: s_mov_b32 s2, -1 -; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null -; GFX12-NEXT: s_endpgm +; GFX12-DEFSCHED-LABEL: mul64_sext_c: +; GFX12-DEFSCHED: ; %bb.0: ; %entry +; GFX12-DEFSCHED-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-DEFSCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-DEFSCHED-NEXT: s_ashr_i32 s3, s2, 31 +; GFX12-DEFSCHED-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-DEFSCHED-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-DEFSCHED-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s2, -1 +; GFX12-DEFSCHED-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX12-DEFSCHED-NEXT: s_endpgm +; +; GFX12-SSASCHED-LABEL: mul64_sext_c: +; GFX12-SSASCHED: ; %bb.0: ; %entry +; GFX12-SSASCHED-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-SSASCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-SSASCHED-NEXT: s_ashr_i32 s3, s2, 31 +; GFX12-SSASCHED-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-SSASCHED-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50 +; GFX12-SSASCHED-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; 
GFX12-SSASCHED-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-SSASCHED-NEXT: s_mov_b32 s2, -1 +; GFX12-SSASCHED-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX12-SSASCHED-NEXT: s_endpgm ; ; GFX1250-LABEL: mul64_sext_c: ; GFX1250: ; %bb.0: ; %entry @@ -853,17 +926,30 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: mul64_zext_c: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s3, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 -; GFX12-NEXT: s_mov_b32 s3, 0x31016000 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: s_mov_b32 s2, -1 -; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null -; GFX12-NEXT: s_endpgm +; GFX12-DEFSCHED-LABEL: mul64_zext_c: +; GFX12-DEFSCHED: ; %bb.0: ; %entry +; GFX12-DEFSCHED-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s3, 0 +; GFX12-DEFSCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-DEFSCHED-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-DEFSCHED-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s2, -1 +; GFX12-DEFSCHED-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX12-DEFSCHED-NEXT: s_endpgm +; +; GFX12-SSASCHED-LABEL: mul64_zext_c: +; GFX12-SSASCHED: ; %bb.0: ; %entry +; GFX12-SSASCHED-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-SSASCHED-NEXT: s_mov_b32 s3, 0 +; GFX12-SSASCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-SSASCHED-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50 +; GFX12-SSASCHED-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SSASCHED-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SSASCHED-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-SSASCHED-NEXT: s_mov_b32 s2, -1 +; GFX12-SSASCHED-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX12-SSASCHED-NEXT: s_endpgm ; ; GFX1250-LABEL: mul64_zext_c: ; GFX1250: ; %bb.0: ; %entry @@ -994,24 +1080,41 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_mul64_sext_c: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s6, -1 -; GFX12-NEXT: s_mov_b32 s7, 0x31016000 -; GFX12-NEXT: s_mov_b32 s10, s6 -; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s8, s2 -; GFX12-NEXT: s_mov_b32 s9, s3 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null -; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mul_hi_i32 v1, 0x50, v0 -; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0 -; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX12-NEXT: s_endpgm +; GFX12-DEFSCHED-LABEL: v_mul64_sext_c: +; GFX12-DEFSCHED: ; %bb.0: ; %entry +; GFX12-DEFSCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s6, -1 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s10, s6 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s11, s7 +; GFX12-DEFSCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s8, s2 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s9, s3 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s4, s0 +; GFX12-DEFSCHED-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX12-DEFSCHED-NEXT: s_mov_b32 s5, s1 +; GFX12-DEFSCHED-NEXT: s_wait_loadcnt 0x0 +; GFX12-DEFSCHED-NEXT: v_mul_hi_i32 v1, 0x50, v0 +; 
GFX12-DEFSCHED-NEXT: v_mul_lo_u32 v0, 0x50, v0 +; GFX12-DEFSCHED-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX12-DEFSCHED-NEXT: s_endpgm +; +; GFX12-SSASCHED-LABEL: v_mul64_sext_c: +; GFX12-SSASCHED: ; %bb.0: ; %entry +; GFX12-SSASCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SSASCHED-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-SSASCHED-NEXT: s_mov_b32 s6, -1 +; GFX12-SSASCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-SSASCHED-NEXT: s_mov_b32 s4, s2 +; GFX12-SSASCHED-NEXT: s_mov_b32 s5, s3 +; GFX12-SSASCHED-NEXT: s_mov_b32 s2, s6 +; GFX12-SSASCHED-NEXT: buffer_load_b32 v0, off, s[4:7], null +; GFX12-SSASCHED-NEXT: s_mov_b32 s3, s7 +; GFX12-SSASCHED-NEXT: s_wait_loadcnt 0x0 +; GFX12-SSASCHED-NEXT: v_mul_hi_i32 v1, 0x50, v0 +; GFX12-SSASCHED-NEXT: v_mul_lo_u32 v0, 0x50, v0 +; GFX12-SSASCHED-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX12-SSASCHED-NEXT: s_endpgm ; ; GFX1250-LABEL: v_mul64_sext_c: ; GFX1250: ; %bb.0: ; %entry @@ -1157,24 +1260,41 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_mul64_zext_c: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s6, -1 -; GFX12-NEXT: s_mov_b32 s7, 0x31016000 -; GFX12-NEXT: s_mov_b32 s10, s6 -; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s8, s2 -; GFX12-NEXT: s_mov_b32 s9, s3 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null -; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mul_hi_u32 v1, 0x50, v0 -; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0 -; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX12-NEXT: s_endpgm +; GFX12-DEFSCHED-LABEL: v_mul64_zext_c: +; GFX12-DEFSCHED: ; %bb.0: ; %entry +; GFX12-DEFSCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s6, -1 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s10, s6 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s11, s7 +; GFX12-DEFSCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s8, s2 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s9, s3 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s4, s0 +; GFX12-DEFSCHED-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX12-DEFSCHED-NEXT: s_mov_b32 s5, s1 +; GFX12-DEFSCHED-NEXT: s_wait_loadcnt 0x0 +; GFX12-DEFSCHED-NEXT: v_mul_hi_u32 v1, 0x50, v0 +; GFX12-DEFSCHED-NEXT: v_mul_lo_u32 v0, 0x50, v0 +; GFX12-DEFSCHED-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX12-DEFSCHED-NEXT: s_endpgm +; +; GFX12-SSASCHED-LABEL: v_mul64_zext_c: +; GFX12-SSASCHED: ; %bb.0: ; %entry +; GFX12-SSASCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SSASCHED-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-SSASCHED-NEXT: s_mov_b32 s6, -1 +; GFX12-SSASCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-SSASCHED-NEXT: s_mov_b32 s4, s2 +; GFX12-SSASCHED-NEXT: s_mov_b32 s5, s3 +; GFX12-SSASCHED-NEXT: s_mov_b32 s2, s6 +; GFX12-SSASCHED-NEXT: buffer_load_b32 v0, off, s[4:7], null +; GFX12-SSASCHED-NEXT: s_mov_b32 s3, s7 +; GFX12-SSASCHED-NEXT: s_wait_loadcnt 0x0 +; GFX12-SSASCHED-NEXT: v_mul_hi_u32 v1, 0x50, v0 +; GFX12-SSASCHED-NEXT: v_mul_lo_u32 v0, 0x50, v0 +; GFX12-SSASCHED-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX12-SSASCHED-NEXT: s_endpgm ; ; GFX1250-LABEL: v_mul64_zext_c: ; GFX1250: ; %bb.0: ; %entry @@ -1316,24 +1436,41 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: buffer_store_b64 
v[0:1], off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_mul64_sext_inline_imm: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s6, -1 -; GFX12-NEXT: s_mov_b32 s7, 0x31016000 -; GFX12-NEXT: s_mov_b32 s10, s6 -; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s8, s2 -; GFX12-NEXT: s_mov_b32 s9, s3 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null -; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mul_hi_i32 v1, 9, v0 -; GFX12-NEXT: v_mul_lo_u32 v0, 9, v0 -; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX12-NEXT: s_endpgm +; GFX12-DEFSCHED-LABEL: v_mul64_sext_inline_imm: +; GFX12-DEFSCHED: ; %bb.0: ; %entry +; GFX12-DEFSCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s6, -1 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s10, s6 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s11, s7 +; GFX12-DEFSCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s8, s2 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s9, s3 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s4, s0 +; GFX12-DEFSCHED-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX12-DEFSCHED-NEXT: s_mov_b32 s5, s1 +; GFX12-DEFSCHED-NEXT: s_wait_loadcnt 0x0 +; GFX12-DEFSCHED-NEXT: v_mul_hi_i32 v1, 9, v0 +; GFX12-DEFSCHED-NEXT: v_mul_lo_u32 v0, 9, v0 +; GFX12-DEFSCHED-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX12-DEFSCHED-NEXT: s_endpgm +; +; GFX12-SSASCHED-LABEL: v_mul64_sext_inline_imm: +; GFX12-SSASCHED: ; %bb.0: ; %entry +; GFX12-SSASCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SSASCHED-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-SSASCHED-NEXT: s_mov_b32 s6, -1 +; GFX12-SSASCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-SSASCHED-NEXT: s_mov_b32 s4, s2 +; GFX12-SSASCHED-NEXT: s_mov_b32 s5, s3 +; GFX12-SSASCHED-NEXT: s_mov_b32 s2, s6 +; GFX12-SSASCHED-NEXT: buffer_load_b32 v0, off, s[4:7], null +; GFX12-SSASCHED-NEXT: s_mov_b32 s3, s7 +; GFX12-SSASCHED-NEXT: s_wait_loadcnt 0x0 +; GFX12-SSASCHED-NEXT: v_mul_hi_i32 v1, 9, v0 +; GFX12-SSASCHED-NEXT: v_mul_lo_u32 v0, 9, v0 +; GFX12-SSASCHED-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX12-SSASCHED-NEXT: s_endpgm ; ; GFX1250-LABEL: v_mul64_sext_inline_imm: ; GFX1250: ; %bb.0: ; %entry @@ -1584,23 +1721,39 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_mul_i32: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s6, -1 -; GFX12-NEXT: s_mov_b32 s7, 0x31016000 -; GFX12-NEXT: s_mov_b32 s10, s6 -; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s8, s2 -; GFX12-NEXT: s_mov_b32 s9, s3 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null -; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1 -; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null -; GFX12-NEXT: s_endpgm +; GFX12-DEFSCHED-LABEL: v_mul_i32: +; GFX12-DEFSCHED: ; %bb.0: ; %entry +; GFX12-DEFSCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s6, -1 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s10, s6 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s11, s7 +; GFX12-DEFSCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s8, s2 +; GFX12-DEFSCHED-NEXT: 
s_mov_b32 s9, s3 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s4, s0 +; GFX12-DEFSCHED-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null +; GFX12-DEFSCHED-NEXT: s_mov_b32 s5, s1 +; GFX12-DEFSCHED-NEXT: s_wait_loadcnt 0x0 +; GFX12-DEFSCHED-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX12-DEFSCHED-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX12-DEFSCHED-NEXT: s_endpgm +; +; GFX12-SSASCHED-LABEL: v_mul_i32: +; GFX12-SSASCHED: ; %bb.0: ; %entry +; GFX12-SSASCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SSASCHED-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-SSASCHED-NEXT: s_mov_b32 s6, -1 +; GFX12-SSASCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-SSASCHED-NEXT: s_mov_b32 s4, s2 +; GFX12-SSASCHED-NEXT: s_mov_b32 s5, s3 +; GFX12-SSASCHED-NEXT: s_mov_b32 s2, s6 +; GFX12-SSASCHED-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null +; GFX12-SSASCHED-NEXT: s_mov_b32 s3, s7 +; GFX12-SSASCHED-NEXT: s_wait_loadcnt 0x0 +; GFX12-SSASCHED-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX12-SSASCHED-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX12-SSASCHED-NEXT: s_endpgm ; ; GFX1250-LABEL: v_mul_i32: ; GFX1250: ; %bb.0: ; %entry @@ -1888,27 +2041,47 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX11-NEXT: buffer_store_b8 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_mul_i1: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s6, -1 -; GFX12-NEXT: s_mov_b32 s7, 0x31016000 -; GFX12-NEXT: s_mov_b32 s10, s6 -; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s8, s2 -; GFX12-NEXT: s_mov_b32 s9, s3 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: buffer_load_u8 v0, off, s[8:11], null -; GFX12-NEXT: buffer_load_u8 v1, off, s[8:11], null offset:4 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX12-NEXT: buffer_store_b8 v0, off, s[4:7], null -; GFX12-NEXT: s_endpgm +; GFX12-DEFSCHED-LABEL: v_mul_i1: +; GFX12-DEFSCHED: ; %bb.0: ; %entry +; GFX12-DEFSCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s6, -1 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s10, s6 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s11, s7 +; GFX12-DEFSCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s8, s2 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s9, s3 +; GFX12-DEFSCHED-NEXT: s_clause 0x1 +; GFX12-DEFSCHED-NEXT: buffer_load_u8 v0, off, s[8:11], null +; GFX12-DEFSCHED-NEXT: buffer_load_u8 v1, off, s[8:11], null offset:4 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s4, s0 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s5, s1 +; GFX12-DEFSCHED-NEXT: s_wait_loadcnt 0x0 +; GFX12-DEFSCHED-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX12-DEFSCHED-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-DEFSCHED-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX12-DEFSCHED-NEXT: buffer_store_b8 v0, off, s[4:7], null +; GFX12-DEFSCHED-NEXT: s_endpgm +; +; GFX12-SSASCHED-LABEL: v_mul_i1: +; GFX12-SSASCHED: ; %bb.0: ; %entry +; GFX12-SSASCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SSASCHED-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-SSASCHED-NEXT: s_mov_b32 s6, -1 +; GFX12-SSASCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-SSASCHED-NEXT: s_mov_b32 s4, s2 +; GFX12-SSASCHED-NEXT: s_mov_b32 s5, s3 +; GFX12-SSASCHED-NEXT: s_clause 0x1 +; GFX12-SSASCHED-NEXT: buffer_load_u8 v0, off, s[4:7], null +; GFX12-SSASCHED-NEXT: buffer_load_u8 v1, off, s[4:7], null offset:4 +; 
GFX12-SSASCHED-NEXT: s_mov_b32 s2, s6 +; GFX12-SSASCHED-NEXT: s_mov_b32 s3, s7 +; GFX12-SSASCHED-NEXT: s_wait_loadcnt 0x0 +; GFX12-SSASCHED-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX12-SSASCHED-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SSASCHED-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX12-SSASCHED-NEXT: buffer_store_b8 v0, off, s[0:3], null +; GFX12-SSASCHED-NEXT: s_endpgm ; ; GFX1250-LABEL: v_mul_i1: ; GFX1250: ; %bb.0: ; %entry @@ -2076,18 +2249,32 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: s_mul_i64: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], s[4:5] -; GFX12-NEXT: s_mov_b32 s3, 0x31016000 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: s_mov_b32 s2, -1 -; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null -; GFX12-NEXT: s_endpgm +; GFX12-DEFSCHED-LABEL: s_mul_i64: +; GFX12-DEFSCHED: ; %bb.0: ; %entry +; GFX12-DEFSCHED-NEXT: s_clause 0x1 +; GFX12-DEFSCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-DEFSCHED-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-DEFSCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-DEFSCHED-NEXT: s_mul_u64 s[4:5], s[2:3], s[4:5] +; GFX12-DEFSCHED-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-DEFSCHED-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s2, -1 +; GFX12-DEFSCHED-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX12-DEFSCHED-NEXT: s_endpgm +; +; GFX12-SSASCHED-LABEL: s_mul_i64: +; GFX12-SSASCHED: ; %bb.0: ; %entry +; GFX12-SSASCHED-NEXT: s_clause 0x1 +; GFX12-SSASCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SSASCHED-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-SSASCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-SSASCHED-NEXT: s_mul_u64 s[2:3], s[2:3], s[4:5] +; GFX12-SSASCHED-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SSASCHED-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SSASCHED-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-SSASCHED-NEXT: s_mov_b32 s2, -1 +; GFX12-SSASCHED-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX12-SSASCHED-NEXT: s_endpgm ; ; GFX1250-LABEL: s_mul_i64: ; GFX1250: ; %bb.0: ; %entry @@ -2261,34 +2448,59 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_mul_i64: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX12-NEXT: s_mov_b32 s10, -1 -; GFX12-NEXT: s_mov_b32 s11, 0x31016000 -; GFX12-NEXT: s_mov_b32 s6, s10 -; GFX12-NEXT: s_mov_b32 s7, s11 -; GFX12-NEXT: s_mov_b32 s14, s10 -; GFX12-NEXT: s_mov_b32 s15, s11 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s12, s2 -; GFX12-NEXT: s_mov_b32 s13, s3 -; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null -; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[12:15], null -; GFX12-NEXT: s_mov_b32 s8, s0 -; GFX12-NEXT: s_mov_b32 s9, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mul_lo_u32 v3, v0, v3 -; GFX12-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX12-NEXT: v_mul_hi_u32 v4, v0, v2 -; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_nc_u32_e32 v1, v3, v1 -; GFX12-NEXT: 
v_add_nc_u32_e32 v1, v1, v4 -; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null -; GFX12-NEXT: s_endpgm +; GFX12-DEFSCHED-LABEL: v_mul_i64: +; GFX12-DEFSCHED: ; %bb.0: ; %entry +; GFX12-DEFSCHED-NEXT: s_clause 0x1 +; GFX12-DEFSCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-DEFSCHED-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s10, -1 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s6, s10 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s7, s11 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s14, s10 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s15, s11 +; GFX12-DEFSCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s12, s2 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s13, s3 +; GFX12-DEFSCHED-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null +; GFX12-DEFSCHED-NEXT: buffer_load_b64 v[2:3], off, s[12:15], null +; GFX12-DEFSCHED-NEXT: s_mov_b32 s8, s0 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s9, s1 +; GFX12-DEFSCHED-NEXT: s_wait_loadcnt 0x0 +; GFX12-DEFSCHED-NEXT: v_mul_lo_u32 v3, v0, v3 +; GFX12-DEFSCHED-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-DEFSCHED-NEXT: v_mul_hi_u32 v4, v0, v2 +; GFX12-DEFSCHED-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX12-DEFSCHED-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-DEFSCHED-NEXT: v_add_nc_u32_e32 v1, v3, v1 +; GFX12-DEFSCHED-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX12-DEFSCHED-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null +; GFX12-DEFSCHED-NEXT: s_endpgm +; +; GFX12-SSASCHED-LABEL: v_mul_i64: +; GFX12-SSASCHED: ; %bb.0: ; %entry +; GFX12-SSASCHED-NEXT: s_clause 0x1 +; GFX12-SSASCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SSASCHED-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-SSASCHED-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-SSASCHED-NEXT: s_mov_b32 s6, -1 +; GFX12-SSASCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-SSASCHED-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null +; GFX12-SSASCHED-NEXT: s_mov_b32 s4, s2 +; GFX12-SSASCHED-NEXT: s_mov_b32 s5, s3 +; GFX12-SSASCHED-NEXT: s_mov_b32 s2, s6 +; GFX12-SSASCHED-NEXT: buffer_load_b64 v[2:3], off, s[4:7], null +; GFX12-SSASCHED-NEXT: s_mov_b32 s3, s7 +; GFX12-SSASCHED-NEXT: s_wait_loadcnt 0x0 +; GFX12-SSASCHED-NEXT: v_mul_lo_u32 v3, v0, v3 +; GFX12-SSASCHED-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-SSASCHED-NEXT: v_mul_hi_u32 v4, v0, v2 +; GFX12-SSASCHED-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX12-SSASCHED-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SSASCHED-NEXT: v_add_nc_u32_e32 v1, v3, v1 +; GFX12-SSASCHED-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX12-SSASCHED-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX12-SSASCHED-NEXT: s_endpgm ; ; GFX1250-LABEL: v_mul_i64: ; GFX1250: ; %bb.0: ; %entry @@ -3171,48 +3383,91 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: s_mul_i128: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x7c -; GFX12-NEXT: s_load_b128 s[12:15], s[4:5], 0x4c -; GFX12-NEXT: s_mov_b32 s3, 0 -; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s7, s3 -; GFX12-NEXT: s_mov_b32 s5, s3 -; GFX12-NEXT: s_mov_b32 s17, s3 -; GFX12-NEXT: s_mov_b32 s19, s3 -; GFX12-NEXT: s_mov_b32 s20, s3 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s2, s8 -; GFX12-NEXT: s_mov_b32 s6, s12 -; GFX12-NEXT: s_mov_b32 s4, s13 -; GFX12-NEXT: s_mul_u64 s[22:23], s[6:7], s[2:3] -; GFX12-NEXT: s_mul_u64 s[24:25], s[4:5], 
s[2:3] -; GFX12-NEXT: s_mov_b32 s2, s23 -; GFX12-NEXT: s_mov_b32 s16, s9 -; GFX12-NEXT: s_mul_u64 s[10:11], s[10:11], s[12:13] -; GFX12-NEXT: s_add_nc_u64 s[12:13], s[24:25], s[2:3] -; GFX12-NEXT: s_mul_u64 s[6:7], s[6:7], s[16:17] -; GFX12-NEXT: s_mov_b32 s2, s13 -; GFX12-NEXT: s_mov_b32 s13, s3 -; GFX12-NEXT: s_mul_u64 s[8:9], s[8:9], s[14:15] -; GFX12-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[12:13] -; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[16:17] -; GFX12-NEXT: s_mov_b32 s18, s7 -; GFX12-NEXT: s_mov_b32 s23, s3 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[18:19] -; GFX12-NEXT: s_add_nc_u64 s[8:9], s[10:11], s[8:9] -; GFX12-NEXT: s_mov_b32 s21, s6 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] -; GFX12-NEXT: s_or_b64 s[6:7], s[22:23], s[20:21] -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[8:9] -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: s_mov_b32 s3, 0x31016000 -; GFX12-NEXT: s_mov_b32 s2, -1 -; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null -; GFX12-NEXT: s_endpgm +; GFX12-DEFSCHED-LABEL: s_mul_i128: +; GFX12-DEFSCHED: ; %bb.0: ; %entry +; GFX12-DEFSCHED-NEXT: s_clause 0x1 +; GFX12-DEFSCHED-NEXT: s_load_b128 s[8:11], s[4:5], 0x7c +; GFX12-DEFSCHED-NEXT: s_load_b128 s[12:15], s[4:5], 0x4c +; GFX12-DEFSCHED-NEXT: s_mov_b32 s3, 0 +; GFX12-DEFSCHED-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s7, s3 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s5, s3 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s17, s3 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s19, s3 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s20, s3 +; GFX12-DEFSCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s2, s8 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s6, s12 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s4, s13 +; GFX12-DEFSCHED-NEXT: s_mul_u64 s[22:23], s[6:7], s[2:3] +; GFX12-DEFSCHED-NEXT: s_mul_u64 s[24:25], s[4:5], s[2:3] +; GFX12-DEFSCHED-NEXT: s_mov_b32 s2, s23 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s16, s9 +; GFX12-DEFSCHED-NEXT: s_mul_u64 s[10:11], s[10:11], s[12:13] +; GFX12-DEFSCHED-NEXT: s_add_nc_u64 s[12:13], s[24:25], s[2:3] +; GFX12-DEFSCHED-NEXT: s_mul_u64 s[6:7], s[6:7], s[16:17] +; GFX12-DEFSCHED-NEXT: s_mov_b32 s2, s13 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s13, s3 +; GFX12-DEFSCHED-NEXT: s_mul_u64 s[8:9], s[8:9], s[14:15] +; GFX12-DEFSCHED-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[12:13] +; GFX12-DEFSCHED-NEXT: s_mul_u64 s[4:5], s[4:5], s[16:17] +; GFX12-DEFSCHED-NEXT: s_mov_b32 s18, s7 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s23, s3 +; GFX12-DEFSCHED-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[18:19] +; GFX12-DEFSCHED-NEXT: s_add_nc_u64 s[8:9], s[10:11], s[8:9] +; GFX12-DEFSCHED-NEXT: s_mov_b32 s21, s6 +; GFX12-DEFSCHED-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] +; GFX12-DEFSCHED-NEXT: s_or_b64 s[6:7], s[22:23], s[20:21] +; GFX12-DEFSCHED-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[8:9] +; GFX12-DEFSCHED-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-DEFSCHED-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-DEFSCHED-NEXT: s_mov_b32 s2, -1 +; GFX12-DEFSCHED-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null +; GFX12-DEFSCHED-NEXT: s_endpgm +; +; GFX12-SSASCHED-LABEL: s_mul_i128: +; GFX12-SSASCHED: ; %bb.0: ; %entry +; GFX12-SSASCHED-NEXT: s_clause 0x1 +; GFX12-SSASCHED-NEXT: s_load_b128 s[8:11], s[4:5], 0x4c +; GFX12-SSASCHED-NEXT: s_load_b128 s[12:15], s[4:5], 0x7c +; GFX12-SSASCHED-NEXT: s_mov_b32 s3, 0 +; GFX12-SSASCHED-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12-SSASCHED-NEXT: 
s_mov_b32 s7, s3 +; GFX12-SSASCHED-NEXT: s_mov_b32 s19, s3 +; GFX12-SSASCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-SSASCHED-NEXT: s_mov_b32 s6, s8 +; GFX12-SSASCHED-NEXT: s_mov_b32 s2, s12 +; GFX12-SSASCHED-NEXT: s_mov_b32 s18, s9 +; GFX12-SSASCHED-NEXT: s_mul_u64 s[16:17], s[6:7], s[2:3] +; GFX12-SSASCHED-NEXT: s_mul_u64 s[20:21], s[18:19], s[2:3] +; GFX12-SSASCHED-NEXT: s_mov_b32 s2, s13 +; GFX12-SSASCHED-NEXT: s_mul_u64 s[8:9], s[14:15], s[8:9] +; GFX12-SSASCHED-NEXT: s_mul_u64 s[4:5], s[6:7], s[2:3] +; GFX12-SSASCHED-NEXT: s_mul_u64 s[6:7], s[12:13], s[10:11] +; GFX12-SSASCHED-NEXT: s_mov_b32 s10, s17 +; GFX12-SSASCHED-NEXT: s_mov_b32 s11, s3 +; GFX12-SSASCHED-NEXT: s_mul_u64 s[12:13], s[18:19], s[2:3] +; GFX12-SSASCHED-NEXT: s_add_nc_u64 s[10:11], s[20:21], s[10:11] +; GFX12-SSASCHED-NEXT: s_add_nc_u64 s[6:7], s[8:9], s[6:7] +; GFX12-SSASCHED-NEXT: s_mov_b32 s2, s10 +; GFX12-SSASCHED-NEXT: s_mov_b32 s9, s3 +; GFX12-SSASCHED-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[2:3] +; GFX12-SSASCHED-NEXT: s_mov_b32 s2, s11 +; GFX12-SSASCHED-NEXT: s_mov_b32 s8, s5 +; GFX12-SSASCHED-NEXT: s_mov_b32 s17, s3 +; GFX12-SSASCHED-NEXT: s_add_nc_u64 s[8:9], s[2:3], s[8:9] +; GFX12-SSASCHED-NEXT: s_mov_b32 s2, s3 +; GFX12-SSASCHED-NEXT: s_mov_b32 s3, s4 +; GFX12-SSASCHED-NEXT: s_add_nc_u64 s[4:5], s[12:13], s[8:9] +; GFX12-SSASCHED-NEXT: s_or_b64 s[2:3], s[16:17], s[2:3] +; GFX12-SSASCHED-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX12-SSASCHED-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SSASCHED-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-SSASCHED-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-SSASCHED-NEXT: s_mov_b32 s2, -1 +; GFX12-SSASCHED-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null +; GFX12-SSASCHED-NEXT: s_endpgm ; ; GFX1250-LABEL: s_mul_i128: ; GFX1250: ; %bb.0: ; %entry @@ -3500,42 +3755,78 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; GFX11-NEXT: global_store_b128 v17, v[8:11], s[2:3] ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_mul_i128: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v13, 4, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_load_b128 v[0:3], v13, s[0:1] -; GFX12-NEXT: global_load_b128 v[4:7], v13, s[2:3] -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v0, v4, 0 -; GFX12-NEXT: v_mul_lo_u32 v14, v5, v2 -; GFX12-NEXT: v_mul_lo_u32 v7, v7, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v1, v4, v[9:10] -; GFX12-NEXT: v_mov_b32_e32 v9, v11 -; GFX12-NEXT: v_mul_lo_u32 v11, v4, v3 -; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v4, v2, 0 -; GFX12-NEXT: v_mul_lo_u32 v4, v6, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v0, v5, v[9:10] -; GFX12-NEXT: v_add3_u32 v3, v3, v11, v14 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_u32 v10, s0, v12, v10 -; GFX12-NEXT: v_add_co_ci_u32_e64 v11, null, 0, 0, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v6, v0, v[2:3] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v5, v[10:11] -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add3_u32 v3, v7, v3, v4 -; GFX12-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v11, null, v1, v3, vcc_lo -; GFX12-NEXT: global_store_b128 v13, v[8:11], s[2:3] -; GFX12-NEXT: s_endpgm +; GFX12-DEFSCHED-LABEL: v_mul_i128: +; GFX12-DEFSCHED: ; %bb.0: ; %entry +; GFX12-DEFSCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX12-DEFSCHED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-DEFSCHED-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-DEFSCHED-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v13, 4, v0 +; GFX12-DEFSCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-DEFSCHED-NEXT: s_clause 0x1 +; GFX12-DEFSCHED-NEXT: global_load_b128 v[0:3], v13, s[0:1] +; GFX12-DEFSCHED-NEXT: global_load_b128 v[4:7], v13, s[2:3] +; GFX12-DEFSCHED-NEXT: s_wait_loadcnt 0x0 +; GFX12-DEFSCHED-NEXT: v_mad_co_u64_u32 v[8:9], null, v0, v4, 0 +; GFX12-DEFSCHED-NEXT: v_mul_lo_u32 v14, v5, v2 +; GFX12-DEFSCHED-NEXT: v_mul_lo_u32 v7, v7, v0 +; GFX12-DEFSCHED-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-DEFSCHED-NEXT: v_mad_co_u64_u32 v[11:12], null, v1, v4, v[9:10] +; GFX12-DEFSCHED-NEXT: v_mov_b32_e32 v9, v11 +; GFX12-DEFSCHED-NEXT: v_mul_lo_u32 v11, v4, v3 +; GFX12-DEFSCHED-NEXT: v_mad_co_u64_u32 v[2:3], null, v4, v2, 0 +; GFX12-DEFSCHED-NEXT: v_mul_lo_u32 v4, v6, v1 +; GFX12-DEFSCHED-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-DEFSCHED-NEXT: v_mad_co_u64_u32 v[9:10], null, v0, v5, v[9:10] +; GFX12-DEFSCHED-NEXT: v_add3_u32 v3, v3, v11, v14 +; GFX12-DEFSCHED-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-DEFSCHED-NEXT: v_add_co_u32 v10, s0, v12, v10 +; GFX12-DEFSCHED-NEXT: v_add_co_ci_u32_e64 v11, null, 0, 0, s0 +; GFX12-DEFSCHED-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-DEFSCHED-NEXT: v_mad_co_u64_u32 v[2:3], null, v6, v0, v[2:3] +; GFX12-DEFSCHED-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v5, v[10:11] +; GFX12-DEFSCHED-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-DEFSCHED-NEXT: v_add3_u32 v3, v7, v3, v4 +; GFX12-DEFSCHED-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2 +; GFX12-DEFSCHED-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-DEFSCHED-NEXT: v_add_co_ci_u32_e64 v11, null, v1, v3, vcc_lo +; GFX12-DEFSCHED-NEXT: global_store_b128 v13, v[8:11], s[2:3] +; GFX12-DEFSCHED-NEXT: s_endpgm +; +; GFX12-SSASCHED-LABEL: v_mul_i128: +; GFX12-SSASCHED: ; %bb.0: ; %entry +; GFX12-SSASCHED-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX12-SSASCHED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SSASCHED-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SSASCHED-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v13, 4, v0 +; GFX12-SSASCHED-NEXT: s_wait_kmcnt 0x0 +; GFX12-SSASCHED-NEXT: s_clause 0x1 +; GFX12-SSASCHED-NEXT: global_load_b128 v[0:3], v13, s[0:1] +; GFX12-SSASCHED-NEXT: global_load_b128 v[4:7], v13, s[2:3] +; GFX12-SSASCHED-NEXT: s_wait_loadcnt 0x0 +; GFX12-SSASCHED-NEXT: v_mad_co_u64_u32 v[8:9], null, v0, v4, 0 +; GFX12-SSASCHED-NEXT: v_mul_lo_u32 v14, v5, v2 +; GFX12-SSASCHED-NEXT: v_mul_lo_u32 v15, v4, v3 +; GFX12-SSASCHED-NEXT: v_mad_co_u64_u32 v[2:3], null, v4, v2, 0 +; GFX12-SSASCHED-NEXT: v_mul_lo_u32 v7, v7, v0 +; GFX12-SSASCHED-NEXT: v_mad_co_u64_u32 v[11:12], null, v1, v4, v[9:10] +; GFX12-SSASCHED-NEXT: v_mul_lo_u32 v4, v6, v1 +; GFX12-SSASCHED-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SSASCHED-NEXT: v_add3_u32 v3, v3, v15, v14 +; GFX12-SSASCHED-NEXT: v_mov_b32_e32 v9, v11 +; GFX12-SSASCHED-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SSASCHED-NEXT: v_mad_co_u64_u32 v[2:3], null, v6, v0, v[2:3] +; GFX12-SSASCHED-NEXT: v_mad_co_u64_u32 v[9:10], null, v0, v5, v[9:10] +; GFX12-SSASCHED-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SSASCHED-NEXT: v_add3_u32 v3, v7, v3, v4 +; GFX12-SSASCHED-NEXT: v_add_co_u32 v10, s0, v12, v10 +; GFX12-SSASCHED-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SSASCHED-NEXT: v_add_co_ci_u32_e64 v11, null, 0, 0, s0 +; GFX12-SSASCHED-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v5, v[10:11] +; GFX12-SSASCHED-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SSASCHED-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2 +; GFX12-SSASCHED-NEXT: v_add_co_ci_u32_e64 v11, null, v1, v3, vcc_lo +; GFX12-SSASCHED-NEXT: global_store_b128 v13, v[8:11], s[2:3] +; GFX12-SSASCHED-NEXT: s_endpgm ; ; GFX1250-LABEL: v_mul_i128: ; GFX1250: ; %bb.0: ; %entry