From eb4594624e505ae7a0d7bca13c4d54e5d5425a0d Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Thu, 1 Feb 2024 13:57:13 -0800 Subject: [PATCH 1/3] [AMDGPU] Introduce IGLPPhase Change-Id: I3690e082b98b57392075cac783b853f3fb48b0e5 --- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 28 +++++++++---------- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h | 10 ++++++- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 6 ++-- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 6 ++-- 4 files changed, 29 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index 4462cd8a31f13..74b62f22aff21 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -852,7 +852,7 @@ class IGLPStrategy { virtual void applyIGLPStrategy( DenseMap &SyncedInstrs, DenseMap> &SyncedSchedGroups, - bool IsReentry) = 0; + IGLPPhase Phase) = 0; // Returns true if this strategy should be applied to a ScheduleDAG. virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0; @@ -871,7 +871,7 @@ class MFMASmallGemmOpt final : public IGLPStrategy { void applyIGLPStrategy( DenseMap &SyncedInstrs, DenseMap> &SyncedSchedGroups, - bool IsReentry) override; + IGLPPhase Phase) override; bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } @@ -884,7 +884,7 @@ class MFMASmallGemmOpt final : public IGLPStrategy { void MFMASmallGemmOpt::applyIGLPStrategy( DenseMap &SyncedInstrs, DenseMap> &SyncedSchedGroups, - bool IsReentry) { + IGLPPhase Phase) { // Count the number of MFMA instructions. 
unsigned MFMACount = 0; for (const MachineInstr &I : *DAG) @@ -1101,7 +1101,7 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { void applyIGLPStrategy( DenseMap &SyncedInstrs, DenseMap> &SyncedSchedGroups, - bool IsReentry) override; + IGLPPhase Phase) override; bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } @@ -1118,11 +1118,11 @@ static unsigned DSWWithSharedVMEMCount = 0; void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( DenseMap &SyncedInstrs, DenseMap> &SyncedSchedGroups, - bool IsReentry) { + IGLPPhase Phase) { unsigned MFMACount = 0; unsigned DSRCount = 0; - assert((IsReentry || (DSWCount == 0 && DSWWithPermCount == 0 && + assert((Phase != IGLPPhase::Initial || (DSWCount == 0 && DSWWithPermCount == 0 && DSWWithSharedVMEMCount == 0)) && "DSWCounters should be zero in pre-RA scheduling!"); SmallVector DSWithPerms; @@ -1133,7 +1133,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( else if (TII->isDS(*I)) { if (I->mayLoad()) ++DSRCount; - else if (I->mayStore() && !IsReentry) { + else if (I->mayStore() && Phase == IGLPPhase::Initial) { ++DSWCount; for (auto Pred : SU.Preds) { if (Pred.getSUnit()->getInstr()->getOpcode() == @@ -1146,7 +1146,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( } } - if (!IsReentry) { + if (Phase == IGLPPhase::Initial) { DSWWithPermCount = DSWithPerms.size(); auto I = DSWithPerms.begin(); auto E = DSWithPerms.end(); @@ -1414,10 +1414,10 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation { bool IsBottomUp = 1; // Whether or not this is a reentry into the IGroupLPDAGMutation. 
- bool IsReentry = false; + IGLPPhase Phase = IGLPPhase::Initial; IGroupLPDAGMutation() = default; - IGroupLPDAGMutation(bool IsReentry) : IsReentry(IsReentry) {} + IGroupLPDAGMutation(IGLPPhase Phase) : Phase(Phase) {} }; unsigned SchedGroup::NumSchedGroups = 0; @@ -1717,7 +1717,7 @@ void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { auto S = createIGLPStrategy(StrategyID, DAG, TII); if (S->shouldApplyStrategy(DAG)) { IsBottomUp = S->IsBottomUp; - S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, IsReentry); + S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase); } } @@ -1725,13 +1725,13 @@ void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { namespace llvm { -/// \p IsReentry specifes whether or not this is a reentry into the +/// \p Phase specifies whether or not this is a reentry into the /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the /// same scheduling region (e.g. pre and post-RA scheduling / multiple /// scheduling "phases"), we can reenter this mutation framework more than once /// for a given region. -std::unique_ptr createIGroupLPDAGMutation(bool IsReentry) { - return std::make_unique(IsReentry); +std::unique_ptr createIGroupLPDAGMutation(IGLPPhase Phase) { + return std::make_unique(Phase); } } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h index 3ec8be4f88920..0b72c3dbecce1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h @@ -14,7 +14,15 @@ namespace llvm { -std::unique_ptr createIGroupLPDAGMutation(bool IsReentry); +// The phase of instruction scheduling in which the IGLP mutation is applied: +// the initial pre-RA schedule, a pre-RA reentry, or the post-RA schedule. 
+enum class IGLPPhase { + Initial = 0u, + PreRAReentry = 1u << 0, + PostRA = 1u << 1 +}; + +std::unique_ptr createIGroupLPDAGMutation(IGLPPhase Phase); } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index b8a7a5e208021..cfebf72c4f42a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -461,7 +461,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); - DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false)); + DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::Initial)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); return DAG; @@ -471,7 +471,7 @@ static ScheduleDAGInstrs * createGCNMaxILPMachineScheduler(MachineSchedContext *C) { ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(C, std::make_unique(C)); - DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false)); + DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::Initial)); return DAG; } @@ -934,7 +934,7 @@ class GCNPassConfig final : public AMDGPUPassConfig { if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII)); - DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/true)); + DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::PostRA)); if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) DAG->addMutation(createVOPDPairingMutation()); return DAG; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 4081115aa68ca..e67a296bcada1 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ 
-713,7 +713,7 @@ bool UnclusteredHighRPStage::initGCNSchedStage() { return false; SavedMutations.swap(DAG.Mutations); - DAG.addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false)); + DAG.addMutation(createIGroupLPDAGMutation(IGLPPhase::PreRAReentry)); InitialOccupancy = DAG.MinOccupancy; // Aggressivly try to reduce register pressure in the unclustered high RP @@ -855,7 +855,7 @@ bool GCNSchedStage::initGCNRegion() { SavedMutations.swap(DAG.Mutations); bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule || StageID == GCNSchedStageID::ILPInitialSchedule; - DAG.addMutation(createIGroupLPDAGMutation(/*IsReentry=*/!IsInitialStage)); + DAG.addMutation(createIGroupLPDAGMutation(IsInitialStage ? IGLPPhase::Initial : IGLPPhase::PreRAReentry)); } return true; @@ -1569,7 +1569,7 @@ void GCNPostScheduleDAGMILive::schedule() { if (HasIGLPInstrs) { SavedMutations.clear(); SavedMutations.swap(Mutations); - addMutation(createIGroupLPDAGMutation(/*IsReentry=*/true)); + addMutation(createIGroupLPDAGMutation(/*IsReentry=*/IGLPPhase::PostRA)); } ScheduleDAGMI::schedule(); From d7a6c9036af149d86b82745f3efa98e82d722009 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 13 Feb 2024 16:28:58 -0800 Subject: [PATCH 2/3] [AMDGPU] Fall back to SavedMutations when not applying IGLP Change-Id: Ib5fac752b9302e8817d7dcc1aded264e0d0b1b7f --- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 49 +++++++++++++------ llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h | 5 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 6 +-- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 9 ++-- 4 files changed, 46 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index 74b62f22aff21..5d7407db594c8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -849,7 +849,7 @@ class IGLPStrategy { public: /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy. 
- virtual void applyIGLPStrategy( + virtual bool applyIGLPStrategy( DenseMap &SyncedInstrs, DenseMap> &SyncedSchedGroups, IGLPPhase Phase) = 0; @@ -868,7 +868,7 @@ class IGLPStrategy { class MFMASmallGemmOpt final : public IGLPStrategy { private: public: - void applyIGLPStrategy( + bool applyIGLPStrategy( DenseMap &SyncedInstrs, DenseMap> &SyncedSchedGroups, IGLPPhase Phase) override; @@ -881,7 +881,7 @@ class MFMASmallGemmOpt final : public IGLPStrategy { } }; -void MFMASmallGemmOpt::applyIGLPStrategy( +bool MFMASmallGemmOpt::applyIGLPStrategy( DenseMap &SyncedInstrs, DenseMap> &SyncedSchedGroups, IGLPPhase Phase) { @@ -902,6 +902,8 @@ void MFMASmallGemmOpt::applyIGLPStrategy( SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); } + + return true; } class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { @@ -1098,7 +1100,7 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { }; public: - void applyIGLPStrategy( + bool applyIGLPStrategy( DenseMap &SyncedInstrs, DenseMap> &SyncedSchedGroups, IGLPPhase Phase) override; @@ -1115,7 +1117,7 @@ static unsigned DSWCount = 0; static unsigned DSWWithPermCount = 0; static unsigned DSWWithSharedVMEMCount = 0; -void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( +bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( DenseMap &SyncedInstrs, DenseMap> &SyncedSchedGroups, IGLPPhase Phase) { @@ -1354,6 +1356,8 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); } + + return true; } static std::unique_ptr @@ -1375,6 +1379,8 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation { ScheduleDAGMI *DAG; + std::vector> *SavedMutations; + // Organize lists of SchedGroups by their SyncID. SchedGroups / // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added // between then. 
@@ -1401,7 +1407,7 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation { void initSchedGroupBarrierPipelineStage( std::vector::reverse_iterator RIter); - void initIGLPOpt(SUnit &SU); + bool initIGLPOpt(SUnit &SU); public: void apply(ScheduleDAGInstrs *DAGInstrs) override; @@ -1417,7 +1423,10 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation { IGLPPhase Phase = IGLPPhase::Initial; IGroupLPDAGMutation() = default; - IGroupLPDAGMutation(IGLPPhase Phase) : Phase(Phase) {} + IGroupLPDAGMutation( + IGLPPhase Phase, + std::vector> *SavedMutations) + : SavedMutations(SavedMutations), Phase(Phase) {} }; unsigned SchedGroup::NumSchedGroups = 0; @@ -1622,8 +1631,7 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { } else if (Opc == AMDGPU::IGLP_OPT) { resetEdges(*R, DAG); if (!foundSB && !foundIGLP) - initIGLPOpt(*R); - foundIGLP = true; + foundIGLP |= initIGLPOpt(*R); } } @@ -1632,7 +1640,13 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { // PipelineSolver performs the mutation by adding the edges it // determined as the best PS.solve(); + return; } + + // !foundSB && !foundIGLP -- most likely we have an IGLP_OPT instruction but + // did not apply any mutation. FIXME: SavedMutations may be null here (several callers pass nullptr); guard before dereferencing. + for (auto &m : *SavedMutations) + m->apply(DAG); } void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { @@ -1711,14 +1725,15 @@ void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage( SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]); } -void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { +bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { IGLPStrategyID StrategyID = (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm(); auto S = createIGLPStrategy(StrategyID, DAG, TII); - if (S->shouldApplyStrategy(DAG)) { - IsBottomUp = S->IsBottomUp; - S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase); - } + if (!S->shouldApplyStrategy(DAG)) + return false; + + IsBottomUp = S->IsBottomUp; + return S->applyIGLPStrategy(SyncedInstrs, 
SyncedSchedGroups, Phase); } } // namespace @@ -1730,8 +1745,10 @@ namespace llvm { /// same scheduling region (e.g. pre and post-RA scheduling / multiple /// scheduling "phases"), we can reenter this mutation framework more than once /// for a given region. -std::unique_ptr createIGroupLPDAGMutation(IGLPPhase Phase) { - return std::make_unique(Phase); +std::unique_ptr createIGroupLPDAGMutation( + IGLPPhase Phase, + std::vector> *SavedMutations) { + return std::make_unique(Phase, SavedMutations); } } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h index 0b72c3dbecce1..4d5e9a17401de 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h @@ -11,6 +11,7 @@ #include "llvm/CodeGen/ScheduleDAGMutation.h" #include +#include namespace llvm { @@ -22,7 +23,9 @@ enum class IGLPPhase { PostRA = 1u << 1 }; -std::unique_ptr createIGroupLPDAGMutation(IGLPPhase Phase); +std::unique_ptr createIGroupLPDAGMutation( + IGLPPhase Phase, + std::vector> *SavedMutations); } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index cfebf72c4f42a..8d94eff2bd26d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -461,7 +461,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); - DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::Initial)); + DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::Initial, nullptr)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); return DAG; @@ -471,7 +471,7 @@ static ScheduleDAGInstrs * createGCNMaxILPMachineScheduler(MachineSchedContext *C) { ScheduleDAGMILive 
*DAG = new GCNScheduleDAGMILive(C, std::make_unique(C)); - DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::Initial)); + DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::Initial, nullptr)); return DAG; } @@ -934,7 +934,7 @@ class GCNPassConfig final : public AMDGPUPassConfig { if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII)); - DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::PostRA)); + DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::PostRA, nullptr)); if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) DAG->addMutation(createVOPDPairingMutation()); return DAG; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index e67a296bcada1..8ce5a4cb58586 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -713,7 +713,7 @@ bool UnclusteredHighRPStage::initGCNSchedStage() { return false; SavedMutations.swap(DAG.Mutations); - DAG.addMutation(createIGroupLPDAGMutation(IGLPPhase::PreRAReentry)); + DAG.addMutation(createIGroupLPDAGMutation(IGLPPhase::PreRAReentry, nullptr)); InitialOccupancy = DAG.MinOccupancy; // Aggressivly try to reduce register pressure in the unclustered high RP @@ -855,7 +855,9 @@ bool GCNSchedStage::initGCNRegion() { SavedMutations.swap(DAG.Mutations); bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule || StageID == GCNSchedStageID::ILPInitialSchedule; - DAG.addMutation(createIGroupLPDAGMutation(IsInitialStage ? IGLPPhase::Initial : IGLPPhase::PreRAReentry)); + DAG.addMutation(createIGroupLPDAGMutation( + IsInitialStage ? 
IGLPPhase::Initial : IGLPPhase::PreRAReentry, + &SavedMutations)); } return true; @@ -1569,7 +1571,8 @@ void GCNPostScheduleDAGMILive::schedule() { if (HasIGLPInstrs) { SavedMutations.clear(); SavedMutations.swap(Mutations); - addMutation(createIGroupLPDAGMutation(/*IsReentry=*/IGLPPhase::PostRA)); + addMutation(createIGroupLPDAGMutation(/*IsReentry=*/IGLPPhase::PostRA, + &SavedMutations)); } ScheduleDAGMI::schedule(); From 51547fb7dd39f4d7ff6f546f672915a6e5c77aab Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Thu, 25 Jan 2024 12:44:12 -0800 Subject: [PATCH 3/3] [AMDGPU] Add iglp_opt(2) to provide initial MFMA/Exp interleaving Change-Id: I2754be185b8d2ccb186b6f6596d4cf7c423c5872 --- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 816 ++++++- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h | 6 +- .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 2055 +++++++++++++++++ .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir | 900 ++++++++ 4 files changed, 3749 insertions(+), 28 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index 5d7407db594c8..eb4f1273707e3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -140,8 +140,6 @@ class SchedGroup { // Count of the number of created SchedGroups, used to initialize SGID. static unsigned NumSchedGroups; - const SIInstrInfo *TII; - // Try to add and edge from SU A to SU B. bool tryAddEdge(SUnit *A, SUnit *B); @@ -154,6 +152,7 @@ class SchedGroup { SmallVector Collection; ScheduleDAGInstrs *DAG; + const SIInstrInfo *TII; // Returns true if SU can be added to this SchedGroup. 
bool canAddSU(SUnit &SU) const; @@ -234,13 +233,13 @@ class SchedGroup { SchedGroup(SchedGroupMask SGMask, std::optional MaxSize, ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) - : SGMask(SGMask), MaxSize(MaxSize), TII(TII), DAG(DAG) { + : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) { SGID = NumSchedGroups++; } SchedGroup(SchedGroupMask SGMask, std::optional MaxSize, int SyncID, ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) - : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), TII(TII), DAG(DAG) { + : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) { SGID = NumSchedGroups++; } }; @@ -442,7 +441,8 @@ void PipelineSolver::convertSyncMapsToArrays() { template void PipelineSolver::linkSchedGroups(T I, T E) { for (; I != E; ++I) { auto &GroupA = *I; - for (auto J = std::next(I); J != E; ++J) { + auto J = std::next(I); + for (; J != E; ++J) { auto &GroupB = *J; GroupA.link(GroupB); } @@ -488,7 +488,9 @@ int PipelineSolver::linkSUnit( continue; } auto Group = *I; - AddedCost += Group.link(*SU, MakePred, AddedEdges); + auto Temp = Group.link(*SU, MakePred, AddedEdges); + + AddedCost += Temp; assert(AddedCost >= 0); } return AddedCost; @@ -633,6 +635,7 @@ bool PipelineSolver::solveExact() { assert(static_cast(CurrConflInstNo) < PipelineInstrs[CurrSyncGroupIdx].size()); SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; + LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); @@ -785,6 +788,7 @@ bool PipelineSolver::solveGreedy() { while (static_cast(CurrSyncGroupIdx) < PipelineInstrs.size()) { SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; + IsBottomUp ? 
greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend()) : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end()); @@ -838,6 +842,7 @@ void PipelineSolver::solve() { enum IGLPStrategyID : int { MFMASmallGemmOptID = 0, MFMASmallGemmSingleWaveOptID = 1, + MFMAExpInterleave = 2 }; // Implement a IGLP scheduling strategy. @@ -855,7 +860,7 @@ class IGLPStrategy { IGLPPhase Phase) = 0; // Returns true if this strategy should be applied to a ScheduleDAG. - virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0; + virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, IGLPPhase Phase) = 0; bool IsBottomUp = 1; @@ -873,7 +878,9 @@ class MFMASmallGemmOpt final : public IGLPStrategy { DenseMap> &SyncedSchedGroups, IGLPPhase Phase) override; - bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } + bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, IGLPPhase Phase) override { + return true; + } MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) : IGLPStrategy(DAG, TII) { @@ -906,6 +913,761 @@ bool MFMASmallGemmOpt::applyIGLPStrategy( return true; } +class MFMAExpInterleaveOpt final : public IGLPStrategy { +private: + /// Whether or not the instruction is a transitive predecessor of an MFMA + /// instruction + class IsPipeExp final : public InstructionRule { + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + + auto DAG = SyncPipe[0].DAG; + auto TII = SyncPipe[0].TII; + + if (Cache->empty()) { + auto I = DAG->SUnits.rbegin(); + auto E = DAG->SUnits.rend(); + for (; I != E; I++) { + if (TII->isMFMAorWMMA(*(I->getInstr()))) + Cache->push_back(&*I); + } + } + + if (Cache->empty()) + return false; + + auto Reaches = (std::any_of( + Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *TargetSU) { + return DAG->IsReachable(TargetSU, const_cast(SU)); + })); + + return Reaches; + } + IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) + : 
InstructionRule(TII, SGID, NeedsCache) {} + }; + + /// Whether or not the instruction is a transitive predecessor of the same + /// MFMA instruction as an instruction in a SchedGroup \p Number steps before + class ProduceSameMFMAWithPrevN final : public InstructionRule { + private: + unsigned Number = 1; + + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + SchedGroup *OtherGroup = nullptr; + for (auto &PipeSG : SyncPipe) { + if ((unsigned)PipeSG.getSGID() == SGID - Number) { + OtherGroup = &PipeSG; + } + } + + if (!OtherGroup) + return false; + if (!OtherGroup->Collection.size()) + return true; + + auto DAG = SyncPipe[0].DAG; + + if (Cache->empty()) { + auto TII = SyncPipe[0].TII; + SmallVector Worklist; + + auto I = DAG->SUnits.rbegin(); + auto E = DAG->SUnits.rend(); + for (; I != E; I++) + if (TII->isMFMAorWMMA(*(I->getInstr()))) + Worklist.push_back(&*I); + + for (auto BaseSU : OtherGroup->Collection) { + if (!Cache->empty()) + break; + for (auto CandSU : Worklist) { + if (DAG->IsReachable(CandSU, BaseSU)) { + Cache->push_back(CandSU); + break; + } + } + } + } + if (Cache->empty()) + return false; + + return DAG->IsReachable((*Cache)[0], const_cast(SU)); + } + + ProduceSameMFMAWithPrevN(unsigned Number, const SIInstrInfo *TII, + unsigned SGID, bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache), Number(Number) {} + }; + + /// Whether or not the instruction has fewer than \p Size immediate successors + class LessThanNSuccs final : public InstructionRule { + private: + unsigned Size = 1; + + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + if (!SyncPipe.size()) + return false; + + return SU->Succs.size() < Size; + } + LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID, + bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache), Size(Size) {} + }; + + // Whether or not the instruction is a V_CVT 
instruction. + class IsPipelineCvt final : public InstructionRule { + private: + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + auto Opc = SU->getInstr()->getOpcode(); + return Opc == AMDGPU::V_CVT_F16_F32_e32 || + Opc == AMDGPU::V_CVT_I32_F32_e32; + } + IsPipelineCvt(const SIInstrInfo *TII, unsigned SGID, + bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache) {} + }; + + // Whether or not the instruction is a V_FMA_F32 instruction. + class IsFMAF32 final : public InstructionRule { + private: + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64; + } + IsFMAF32(unsigned Val, const SIInstrInfo *TII, unsigned SGID, + bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache) {} + }; + + /// Whether or not the instruction is an immediate RAW successor + /// of the SchedGroup \p Distance steps before. 
+ class IsSuccOfPrevNthGroup final : public InstructionRule { + private: + unsigned Distance = 1; + + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + SchedGroup *OtherGroup = nullptr; + if (!SyncPipe.size()) + return false; + + for (auto &PipeSG : SyncPipe) { + if ((unsigned)PipeSG.getSGID() == SGID - Distance) { + OtherGroup = &PipeSG; + } + } + + if (!OtherGroup) + return false; + if (!OtherGroup->Collection.size()) + return true; + + for (auto &OtherEle : OtherGroup->Collection) { + for (auto &Succ : OtherEle->Succs) { + if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data) + return true; + } + } + + return false; + } + IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, + unsigned SGID, bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} + }; + + /// Whether or not the instruction is a transitive successor of any + /// instruction in the SchedGroup \p Distance steps before. 
+ class IsReachableFromPrevNthGroup final : public InstructionRule { + private: + unsigned Distance = 1; + + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + SchedGroup *OtherGroup = nullptr; + if (!SyncPipe.size()) + return false; + + for (auto &PipeSG : SyncPipe) { + if ((unsigned)PipeSG.getSGID() == SGID - Distance) { + OtherGroup = &PipeSG; + } + } + + if (!OtherGroup) + return false; + if (!OtherGroup->Collection.size()) + return true; + + auto DAG = SyncPipe[0].DAG; + + for (auto &OtherEle : OtherGroup->Collection) + if (DAG->IsReachable(const_cast(SU), OtherEle)) + return true; + + return false; + } + IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, + unsigned SGID, bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} + }; + + /// Whether or not the instruction is the \p Number th occurring DS_READ + /// instruction + class IsNthDSRead final : public InstructionRule { + private: + unsigned Number = 1; + + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + + auto DAG = SyncPipe[0].DAG; + auto TII = SyncPipe[0].TII; + unsigned Counter = 0; + if (Cache->empty()) { + for (auto &ParseSU : DAG->SUnits) { + auto MI = ParseSU.getInstr(); + if (TII->isDS(MI->getOpcode()) && MI->mayLoad()) { + if (Counter == Number) { + Cache->push_back(&ParseSU); + break; + } + + ++Counter; + } + } + } + + if (Cache->empty()) + return false; + + return (*Cache)[0]->NodeNum <= SU->NodeNum; + } + IsNthDSRead(unsigned Number, const SIInstrInfo *TII, unsigned SGID, + bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache), Number(Number) {} + }; + + // Whether or not the instruction is not a transitive predecessor of any + // TRANS instruction + class IsPipeMFMA final : public InstructionRule { + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + 
SmallVector Worklist; + auto DAG = SyncPipe[0].DAG; + auto TII = SyncPipe[0].TII; + if (Cache->empty()) { + for (auto &SU : DAG->SUnits) + if (TII->isTRANS(SU.getInstr()->getOpcode())) + Cache->push_back(&SU); + } + + if (Cache->empty()) + return false; + + return !( + std::any_of(Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *BaseSU) { + return DAG->IsReachable(BaseSU, const_cast(SU)); + })); + } + + IsPipeMFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache) {} + }; + + // Whether the instruction occurs after the first TRANS instruction. This + // implies the instruction cannot be a predecessor of the first TRANS + // instruction + class OccursAfterExp final : public InstructionRule { + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + + SmallVector Worklist; + auto DAG = SyncPipe[0].DAG; + auto TII = SyncPipe[0].TII; + if (Cache->empty()) { + for (auto &SU : DAG->SUnits) + if (TII->isTRANS(SU.getInstr()->getOpcode())) { + Cache->push_back(&SU); + break; + } + } + + if (Cache->empty()) + return false; + + return SU->NodeNum > (*Cache)[0]->NodeNum; + } + + OccursAfterExp(const SIInstrInfo *TII, unsigned SGID, + bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache) {} + }; + + // Whether the SU is not a successor of any element in the previous + // SchedGroup + class IsNotSuccOfPrevGroup final : public InstructionRule { + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + SchedGroup *OtherGroup = nullptr; + for (auto &PipeSG : SyncPipe) { + if ((unsigned)PipeSG.getSGID() == SGID - 1) { + OtherGroup = &PipeSG; + } + } + + if (!OtherGroup) + return false; + if (!OtherGroup->Collection.size()) + return true; + + // Does the previous VALU have this DS_Write as a successor + return !(std::any_of(OtherGroup->Collection.begin(), + OtherGroup->Collection.end(), [&SU](SUnit *Elt) { + 
return std::any_of(Elt->Succs.begin(), + Elt->Succs.end(), + [&SU](SDep &Succ) { + return Succ.getSUnit() == SU; + }); + })); + } + IsNotSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID, + bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache) {} + }; + +public: + bool applyIGLPStrategy( + DenseMap &SyncedInstrs, + DenseMap> &SyncedSchedGroups, + IGLPPhase Phase) override; + + bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, IGLPPhase Phase) override; + + MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) + : IGLPStrategy(DAG, TII) { + IsBottomUp = 0; + } +}; + +static unsigned TransPipeCount = 0; +static unsigned MFMAPipeCount = 0; +static unsigned MFMAEnablement = 0; +static unsigned ExpRequirement = 0; + +bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG, + IGLPPhase Phase) { + if (Phase != IGLPPhase::Initial) + return true; + + const GCNSubtarget &ST = DAG->MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + SmallVector ExpPipeCands; + SmallVector MFMAPipeCands; + SmallVector MFMAPipeSUs; + SmallVector PackSUs; + + auto isBitPack = [](unsigned Opc) { + return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64; + }; + for (SUnit &SU : DAG->SUnits) { + auto Opc = SU.getInstr()->getOpcode(); + if (TII->isTRANS(Opc)) { + // Avoid counting a potential bonus V_EXP which all the MFMA depend on + if (SU.Succs.size() >= 7) + continue; + ExpPipeCands.push_back(&SU); + } + + if (TII->isMFMAorWMMA(*SU.getInstr())) + MFMAPipeCands.push_back(&SU); + + if (isBitPack(Opc)) + PackSUs.push_back(&SU); + } + + if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size())) + return false; + + TransPipeCount = 0; + MFMAPipeCount = 0; + MFMAEnablement = 0; + ExpRequirement = 0; + + std::optional TempMFMA; + std::optional TempExp; + // Count the number of EXPs that reach an MFMA + for (auto &PredSU : ExpPipeCands) { + for (auto &SuccSU : MFMAPipeCands) { + if (DAG->IsReachable(SuccSU, 
PredSU)) { + if (!TempExp) { + TempExp = PredSU; + TempMFMA = SuccSU; + } + MFMAPipeSUs.push_back(SuccSU); + ++TransPipeCount; + break; + } + } + } + + if (!TempExp) + return false; + + // Count the number of MFMAs that are reached by an EXP + for (auto &SuccSU : MFMAPipeCands) { + if (std::find_if(MFMAPipeSUs.begin(), MFMAPipeSUs.end(), + [&SuccSU](SUnit *PotentialMatch) { + return PotentialMatch == SuccSU; + }) != MFMAPipeSUs.end()) { + ++MFMAPipeCount; + continue; + } + for (auto &PredSU : ExpPipeCands) { + if (DAG->IsReachable(SuccSU, PredSU)) { + ++MFMAPipeCount; + break; + } + } + } + + if (!TempMFMA || !TempExp) + return false; + + // The number of bit pack operations that depend on a single V_EXP + unsigned PackSuccCount = std::count_if( + PackSUs.begin(), PackSUs.end(), [&TempExp, &DAG](SUnit *VPack) { + return DAG->IsReachable(VPack, *TempExp); + }); + + // The number of bit pack operations an MFMA depends on + unsigned PackPredCount = + std::count_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(), + [&isBitPack](SDep &Pred) { + auto Opc = Pred.getSUnit()->getInstr()->getOpcode(); + return isBitPack(Opc); + }); + + auto PackPred = + std::find_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(), + [&isBitPack](SDep &Pred) { + auto Opc = Pred.getSUnit()->getInstr()->getOpcode(); + return isBitPack(Opc); + }); + + if (PackPred == (*TempMFMA)->Preds.end()) + return false; + + // How many MFMAs depend on a single bit pack operation + MFMAEnablement = + std::count_if(PackPred->getSUnit()->Succs.begin(), + PackPred->getSUnit()->Succs.end(), [&TII](SDep &Succ) { + return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr()); + }); + + // The number of MFMAs that depend on a single V_EXP + MFMAEnablement *= PackSuccCount; + + // The number of V_EXPs required to resolve all dependencies for an MFMA + ExpRequirement = + std::count_if(ExpPipeCands.begin(), ExpPipeCands.end(), + [&PackPred, &DAG](SUnit *ExpBase) { + return DAG->IsReachable(PackPred->getSUnit(), 
ExpBase); + }); + + ExpRequirement *= PackPredCount; + + return true; +} + +bool MFMAExpInterleaveOpt::applyIGLPStrategy( + DenseMap &SyncedInstrs, + DenseMap> &SyncedSchedGroups, + IGLPPhase Phase) { + + const GCNSubtarget &ST = DAG->MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + bool IsSmallKernelType = + MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32; + bool IsLargeKernelType = + MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64; + + if (!(IsSmallKernelType || IsLargeKernelType)) + return false; + + unsigned PipelineSyncID = 0; + SchedGroup *SG = nullptr; + + if (IsSmallKernelType && Phase != IGLPPhase::PostRA) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::TRANS, 4, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + for (unsigned I = 0; I < 4; I++) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID())); + SG->addRule(std::make_shared(1 + 2 * I, TII, + SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + for (unsigned I = 0; I < (TransPipeCount - 8) / 2; ++I) { + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + for (unsigned J = 0; J < 2; J++) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID())); + SG->addRule(std::make_shared(I == 0 ? 
8 : 9, TII, + SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + } + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + if (IsLargeKernelType && Phase != IGLPPhase::PostRA) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(1, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::TRANS, 2, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(10, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + for (unsigned I = 0; I < 3; I++) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 1, PipelineSyncID, 
DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID())); + SG->addRule(std::make_shared( + I == 0 ? 1 : 4, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(1, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(10, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(32, TII, SG->getSGID(), true)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + for (unsigned I = 0; I < 2; I++) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID())); + SG->addRule( + std::make_shared(5, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(1, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(10, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + for (unsigned I = 0; I < (TransPipeCount - 8) / 4; I++) { + for (unsigned J = 0; J < 2; J++) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), 
true)); + SG->addRule(std::make_shared( + I == 0 ? 16 + J : 20, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID())); + SG->addRule(std::make_shared( + I == 0 ? 5 + J : 6, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(1, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(10, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(32, TII, SG->getSGID(), true)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + for (unsigned J = 0; J < 2; J++) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->addRule(std::make_shared( + I == 0 ? 
19 : 21 - J, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID())); + SG->addRule(std::make_shared( + 7, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(1, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(10, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + } + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(15 + 1 * 5, TII, + SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID())); + SG->addRule( + std::make_shared(7, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(1, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(10, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + 
SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(15 + 1 * 5, TII, + SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID())); + SG->addRule(std::make_shared( + 6, TII, SG->getSGID(), true)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(14 + 1 * 4, TII, + SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID())); + SG->addRule( + std::make_shared(4, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 5, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + if (IsLargeKernelType && (Phase == IGLPPhase::PostRA)) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::TRANS, 6, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(10, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + for (unsigned I = 0; I < (TransPipeCount - 7); I++) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + 
SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(10, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 6, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + return true; +} + class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { private: // Whether the DS_READ is a predecessor of first four MFMA in region @@ -1105,7 +1867,9 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { DenseMap> &SyncedSchedGroups, IGLPPhase Phase) override; - bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } + bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, IGLPPhase Phase) override { + return true; + } MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) : IGLPStrategy(DAG, TII) { @@ -1124,9 +1888,10 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( unsigned MFMACount = 0; unsigned DSRCount = 0; - assert((Phase != IGLPPhase::Initial || (DSWCount == 0 && DSWWithPermCount == 0 && - DSWWithSharedVMEMCount == 0)) && - "DSWCounters should be zero in pre-RA scheduling!"); + assert( + (Phase != IGLPPhase::Initial || (DSWCount == 0 && DSWWithPermCount == 0 && + DSWWithSharedVMEMCount == 0)) && + "DSWCounters should be zero in pre-RA scheduling!"); SmallVector DSWithPerms; for (auto &SU : DAG->SUnits) { auto I = SU.getInstr(); @@ -1256,14 +2021,14 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); - SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->addRule(std::make_shared(TII, SG->getSGID())); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( 
SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared( 1, TII, SG->getSGID(), true)); - SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->addRule(std::make_shared(TII, SG->getSGID())); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( @@ -1274,7 +2039,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared( 3, TII, SG->getSGID(), true)); - SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->addRule(std::make_shared(TII, SG->getSGID())); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( @@ -1292,7 +2057,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); - SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->addRule(std::make_shared(TII, SG->getSGID())); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( @@ -1313,7 +2078,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); - SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->addRule(std::make_shared(TII, SG->getSGID())); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( @@ -1327,7 +2092,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); - SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->addRule(std::make_shared(TII, SG->getSGID())); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( @@ -1338,7 +2103,7 @@ bool 
MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared( 2, TII, SG->getSGID(), true)); - SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->addRule(std::make_shared(TII, SG->getSGID())); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( @@ -1349,7 +2114,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared( 4, TII, SG->getSGID(), true)); - SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->addRule(std::make_shared(TII, SG->getSGID())); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( @@ -1368,6 +2133,8 @@ createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG, return std::make_unique(DAG, TII); case MFMASmallGemmSingleWaveOptID: return std::make_unique(DAG, TII); + case MFMAExpInterleave: + return std::make_unique(DAG, TII); } llvm_unreachable("Unknown IGLPStrategyID"); @@ -1600,9 +2367,9 @@ void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) { auto &SU = *I; if (isFull()) break; - - if (canAddSU(SU)) + if (canAddSU(SU)) { SyncedInstrs[&SU].push_back(SGID); + } } } @@ -1643,6 +2410,9 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { return; } + if (!SavedMutations) + return; + // !foundSB && !foundIGLP -- most likely we have an ILGP_OPT instruciton but // did not apply any mutation for (auto &m : *SavedMutations) @@ -1729,7 +2499,7 @@ bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { IGLPStrategyID StrategyID = (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm(); auto S = createIGLPStrategy(StrategyID, DAG, TII); - if (!S->shouldApplyStrategy(DAG)) + if (!S->shouldApplyStrategy(DAG, Phase)) return false; IsBottomUp = S->IsBottomUp; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h 
b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h index 4d5e9a17401de..ec48430ba3684 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h @@ -17,11 +17,7 @@ namespace llvm { // Components of the mask that determines which instruction types may be may be // classified into a SchedGroup. -enum class IGLPPhase { - Initial = 0u, - PreRAReentry = 1u << 0, - PostRA = 1u << 1 -}; +enum class IGLPPhase { Initial = 0u, PreRAReentry = 1u << 0, PostRA = 1u << 1 }; std::unique_ptr createIGroupLPDAGMutation( IGLPPhase Phase, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir new file mode 100644 index 0000000000000..5ae2eb43180fa --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir @@ -0,0 +1,2055 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s + +--- | + define amdgpu_kernel void @largeInterleave() #0 { ret void } + ; GCN-LABEL: largeInterleave: + ; GCN: ; %bb.0: + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: ; implicit-def: $vgpr0 + ; GCN-NEXT: ; implicit-def: $vgpr2 + ; GCN-NEXT: ; implicit-def: $vgpr1 + ; GCN-NEXT: ; implicit-def: $vgpr8 + ; GCN-NEXT: ; implicit-def: $vgpr94 + ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN-NEXT: ; implicit-def: $vgpr106 + ; GCN-NEXT: ; implicit-def: $vgpr128 + ; GCN-NEXT: ; implicit-def: $vgpr129 + ; GCN-NEXT: ; implicit-def: $vgpr135 + ; GCN-NEXT: ; iglp_opt mask(0x00000002) + ; GCN-NEXT: ; implicit-def: $sgpr0 + ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + ; GCN-NEXT: v_readfirstlane_b32 s7, v0 + ; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 + ; GCN-NEXT: ; kill: 
killed $sgpr8_sgpr9_sgpr10_sgpr11 + ; GCN-NEXT: ; implicit-def: $sgpr5 + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_lshl_add_u32 v0, s7, 4, v2 + ; GCN-NEXT: v_mul_lo_u32 v0, v0, s6 + ; GCN-NEXT: v_add_lshl_u32 v92, v0, v1, 1 + ; GCN-NEXT: v_add_u32_e32 v93, s0, v92 + ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v92, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v93, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: s_lshl_b32 s0, s7, 7 + ; GCN-NEXT: v_add_lshl_u32 v95, v8, s0, 1 + ; GCN-NEXT: v_add_u32_e32 v8, 64, v93 + ; GCN-NEXT: ; kill: killed $vgpr8 + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: ; kill: killed $vgpr92 + ; GCN-NEXT: ; implicit-def: $sgpr6 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b128 v95, v[0:3] + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b128 v95, v[4:7] offset:1024 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:64 sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v8, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], 0 + ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 
v[48:63], v[74:75], v[78:79], v[48:63] + ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] + ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] + ; GCN-NEXT: ds_read_b128 v[84:87], v106 offset:1024 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] + ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:1536 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b128 v95, v[64:67] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] + ; GCN-NEXT: v_add_u32_e32 v72, 0x80, v93 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:128 sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: ; kill: killed $vgpr72 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] + ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: s_waitcnt 
lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] + ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] + ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] + ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] + ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] + ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] + ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 
v[32:47], v[72:73], v[76:77], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] + ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] + ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b128 v95, v[64:67] + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 + ; GCN-NEXT: ; implicit-def: $vgpr64 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] + ; GCN-NEXT: v_add_u32_e32 v72, 0xc0, v93 + ; GCN-NEXT: ; implicit-def: $vgpr73 + ; GCN-NEXT: v_add_u32_e32 v76, v128, v64 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ; kill: killed $vgpr72 + ; GCN-NEXT: v_add_u32_e32 v72, v128, v73 + ; GCN-NEXT: buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] + ; GCN-NEXT: ; implicit-def: $vgpr74 + ; GCN-NEXT: v_add_u32_e32 v72, v128, v74 + ; GCN-NEXT: ; implicit-def: $vgpr75 + ; GCN-NEXT: buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: 
s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v72, v128, v75 + ; GCN-NEXT: buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ; kill: killed $vgpr76 + ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN-NEXT: ; implicit-def: $sgpr8 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] + ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] + ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1024 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] + ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1536 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] + ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] + ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] + ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] + ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b128 v95, v[64:67] + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[64:67], v94 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[90:93], v94 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] + ; GCN-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71 + ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[68:69], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] + ; GCN-NEXT: ds_read_b128 v[76:79], v94 offset:1536 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[94:97], v106 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[70:71], v[48:63] + ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[68:69], 
v[32:47] + ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:1024 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[94:95], v[64:65], v[48:63] + ; GCN-NEXT: v_perm_b32 v94, v102, v98, s5 + ; GCN-NEXT: v_perm_b32 v98, v102, v98, s8 + ; GCN-NEXT: v_perm_b32 v102, v103, v99, s5 + ; GCN-NEXT: v_perm_b32 v95, v104, v100, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[92:93], v[70:71], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[68:69], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[96:97], v[66:67], v[48:63] + ; GCN-NEXT: v_perm_b32 v96, v103, v99, s8 + ; GCN-NEXT: v_perm_b32 v99, v104, v100, s8 + ; GCN-NEXT: v_perm_b32 v103, v105, v101, s5 + ; GCN-NEXT: v_perm_b32 v97, v105, v101, s8 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[88:89], v[64:65], v[32:47] + ; GCN-NEXT: s_nop 5 + ; GCN-NEXT: v_mul_f32_e32 v100, s4, v48 + ; GCN-NEXT: v_mul_f32_e32 v101, s4, v49 + ; GCN-NEXT: v_max3_f32 v92, v100, s6, v101 + ; GCN-NEXT: v_mul_f32_e32 v93, s4, v50 + ; GCN-NEXT: v_mul_f32_e32 v100, s4, v51 + ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100 + ; GCN-NEXT: v_mul_f32_e32 v93, s4, v52 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[70:71], v[16:31] + ; GCN-NEXT: v_mul_f32_e32 v100, s4, v53 + ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100 + ; GCN-NEXT: v_mul_f32_e32 v84, s4, v54 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v55 + ; GCN-NEXT: v_max3_f32 v84, v92, v84, v85 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v56 + ; GCN-NEXT: v_mul_f32_e32 v92, s4, v57 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[76:77], v[68:69], v[0:15] + ; GCN-NEXT: v_max3_f32 v84, v84, v85, 
v92 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v58 + ; GCN-NEXT: v_mul_f32_e32 v88, s4, v59 + ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v60 + ; GCN-NEXT: v_mul_f32_e32 v88, s4, v61 + ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[66:67], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v62 + ; GCN-NEXT: v_mul_f32_e32 v88, s4, v63 + ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 + ; GCN-NEXT: ; implicit-def: $sgpr6 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[64:65], v[16:31] + ; GCN-NEXT: s_nop 6 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v32 + ; GCN-NEXT: v_mul_f32_e32 v88, s4, v33 + ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v34 + ; GCN-NEXT: v_mul_f32_e32 v88, s4, v35 + ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v36 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[78:79], v[70:71], v[0:15] + ; GCN-NEXT: v_mul_f32_e32 v86, s4, v37 + ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v38 + ; GCN-NEXT: v_mul_f32_e32 v86, s4, v39 + ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v40 + ; GCN-NEXT: v_mul_f32_e32 v80, s4, v41 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[66:67], v[16:31] + ; GCN-NEXT: v_max3_f32 v80, v84, v85, v80 + ; GCN-NEXT: v_mul_f32_e32 v81, s4, v42 + ; GCN-NEXT: v_mul_f32_e32 v84, s4, v43 + ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84 + ; GCN-NEXT: v_mul_f32_e32 v81, s4, v44 + ; GCN-NEXT: v_mul_f32_e32 v84, s4, v45 + ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[64:65], v[0:15] + ; GCN-NEXT: v_mul_f32_e32 v81, s4, v46 + ; GCN-NEXT: v_mul_f32_e32 v82, s4, v47 + ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82 + ; GCN-NEXT: v_mul_f32_e32 v81, s4, v16 + ; GCN-NEXT: v_mul_f32_e32 v82, s4, v17 + ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82 + ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18 + ; 
GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[66:67], v[0:15] + ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19 + ; GCN-NEXT: v_max3_f32 v68, v80, v68, v69 + ; GCN-NEXT: v_mul_f32_e32 v69, s4, v20 + ; GCN-NEXT: v_mul_f32_e32 v76, s4, v21 + ; GCN-NEXT: v_max3_f32 v68, v68, v69, v76 + ; GCN-NEXT: v_mul_f32_e32 v69, s4, v22 + ; GCN-NEXT: v_mul_f32_e32 v70, s4, v23 + ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70 + ; GCN-NEXT: v_mul_f32_e32 v69, s4, v24 + ; GCN-NEXT: v_mul_f32_e32 v70, s4, v25 + ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70 + ; GCN-NEXT: v_mul_f32_e32 v69, s4, v26 + ; GCN-NEXT: v_mul_f32_e32 v70, s4, v27 + ; GCN-NEXT: v_max3_f32 v64, v68, v69, v70 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v28 + ; GCN-NEXT: v_mul_f32_e32 v68, s4, v29 + ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v30 + ; GCN-NEXT: v_mul_f32_e32 v68, s4, v31 + ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v0 + ; GCN-NEXT: v_mul_f32_e32 v66, s4, v1 + ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v2 + ; GCN-NEXT: v_mul_f32_e32 v66, s4, v3 + ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v4 + ; GCN-NEXT: v_mul_f32_e32 v66, s4, v5 + ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v6 + ; GCN-NEXT: v_mul_f32_e32 v66, s4, v7 + ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v8 + ; GCN-NEXT: v_mul_f32_e32 v66, s4, v9 + ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v10 + ; GCN-NEXT: v_mul_f32_e32 v66, s4, v11 + ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v12 + ; GCN-NEXT: v_mul_f32_e32 v66, s4, v13 + ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v14 + ; GCN-NEXT: v_mul_f32_e32 v66, s4, v15 + ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 + ; GCN-NEXT: ; implicit-def: $vgpr65 + ; GCN-NEXT: ; implicit-def: $vgpr66 + ; GCN-NEXT: 
; implicit-def: $vgpr71 + ; GCN-NEXT: ; implicit-def: $vgpr69 + ; GCN-NEXT: ; implicit-def: $vgpr70 + ; GCN-NEXT: v_add_u32_e32 v65, s7, v65 + ; GCN-NEXT: v_and_b32_e32 v65, 0x1fffffff, v65 + ; GCN-NEXT: v_mul_lo_u32 v65, v65, s6 + ; GCN-NEXT: v_add_lshl_u32 v131, v66, v65, 1 + ; GCN-NEXT: ds_bpermute_b32 v65, v129, v64 + ; GCN-NEXT: ; implicit-def: $vgpr66 + ; GCN-NEXT: v_lshl_add_u32 v132, v66, 1, v131 + ; GCN-NEXT: ; implicit-def: $vgpr66 + ; GCN-NEXT: v_lshl_add_u32 v133, v66, 1, v132 + ; GCN-NEXT: ; implicit-def: $vgpr66 + ; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 + ; GCN-NEXT: v_lshl_add_u32 v134, v66, 1, v133 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v131, v[94:95] + ; GCN-NEXT: v_max_f32_e32 v65, v65, v65 + ; GCN-NEXT: v_max_f32_e32 v64, v64, v65 + ; GCN-NEXT: ds_bpermute_b32 v65, v129, v64 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v132, v[98:99] + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v133, v[102:103] + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v134, v[96:97] + ; GCN-NEXT: v_add_u32_e32 v71, v128, v71 + ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[6:7] + ; GCN-NEXT: v_max_f32_e32 v64, v64, v64 + ; GCN-NEXT: ; implicit-def: $vgpr65 + ; GCN-NEXT: v_max_f32_e32 v66, v65, v65 + ; GCN-NEXT: v_max_f32_e32 v130, v66, v64 + ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v130 + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 + ; GCN-NEXT: v_fma_f32 v64, s4, v49, -v130 + ; GCN-NEXT: v_fma_f32 v66, s4, v50, -v130 + ; GCN-NEXT: v_exp_f32_e32 v49, v48 + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64 + ; GCN-NEXT: v_exp_f32_e32 v50, v48 + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v66 + ; GCN-NEXT: v_fma_f32 v66, s4, v52, -v130 + ; GCN-NEXT: v_fma_f32 v68, s4, v53, -v130 + ; GCN-NEXT: v_mul_f32_e32 v53, 0x3fb8aa3b, v66 + ; GCN-NEXT: v_fma_f32 v66, s4, v54, -v130 + ; 
GCN-NEXT: ; implicit-def: $vgpr54 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_load_dwordx2 v[152:153], v71, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v54, v128, v54 + ; GCN-NEXT: buffer_load_dwordx2 v[154:155], v54, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v54, v128, v69 + ; GCN-NEXT: buffer_load_dwordx2 v[156:157], v54, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v54, v128, v70 + ; GCN-NEXT: buffer_load_dwordx2 v[158:159], v54, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v67, s4, v51, -v130 + ; GCN-NEXT: v_exp_f32_e32 v51, v48 + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v67 + ; GCN-NEXT: v_exp_f32_e32 v52, v48 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_cvt_f16_f32_e32 v48, v51 + ; GCN-NEXT: ds_read_b128 v[110:113], v135 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v54, v52 + ; GCN-NEXT: ds_read_b128 v[114:117], v135 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[118:121], v135 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v49 + ; GCN-NEXT: v_pack_b32_f16 v149, v48, v54 + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v68 + ; GCN-NEXT: v_exp_f32_e32 v54, v48 + ; GCN-NEXT: v_sub_f32_e32 v48, v65, v130 + ; GCN-NEXT: v_cvt_f16_f32_e32 v67, v50 + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 + ; GCN-NEXT: v_exp_f32_e32 v48, v48 + ; GCN-NEXT: v_fma_f32 v80, s4, v55, -v130 + ; GCN-NEXT: v_pack_b32_f16 v148, v64, v67 + ; GCN-NEXT: v_mul_f32_e32 v55, 0x3fb8aa3b, v66 + ; GCN-NEXT: v_fma_f32 v96, s4, v56, -v130 + ; GCN-NEXT: ; 
implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN-NEXT: v_mul_f32_e32 v56, 0x3fb8aa3b, v80 + ; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 + ; GCN-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[70:71], v[70:71], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[72:73], v[72:73], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_mul_f32_e32 v96, 0x3fb8aa3b, v96 + ; GCN-NEXT: v_exp_f32_e32 v53, v53 + ; GCN-NEXT: v_exp_f32_e32 v55, v55 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[110:111], v[148:149], v[64:79] + ; GCN-NEXT: v_exp_f32_e32 v56, v56 + ; GCN-NEXT: ds_read_b128 v[136:139], v135 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v122, v53 + ; GCN-NEXT: v_cvt_f16_f32_e32 v123, v54 + ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v130 + ; GCN-NEXT: v_mul_f32_e32 v57, 
0x3fb8aa3b, v57 + ; GCN-NEXT: v_fma_f32 v161, s4, v61, -v130 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[114:115], v[148:149], v[80:95] + ; GCN-NEXT: v_fma_f32 v115, s4, v58, -v130 + ; GCN-NEXT: v_exp_f32_e32 v58, v96 + ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 + ; GCN-NEXT: v_cvt_f16_f32_e32 v114, v55 + ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pack_b32_f16 v150, v122, v123 + ; GCN-NEXT: v_fma_f32 v164, s4, v62, -v130 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[118:119], v[148:149], v[96:111] + ; GCN-NEXT: v_cvt_f16_f32_e32 v118, v56 + ; GCN-NEXT: v_cvt_f16_f32_e32 v160, v58 + ; GCN-NEXT: v_fma_f32 v44, s4, v44, -v130 + ; GCN-NEXT: v_mul_f32_e32 v44, 0x3fb8aa3b, v44 + ; GCN-NEXT: v_pack_b32_f16 v151, v114, v118 + ; GCN-NEXT: v_fma_f32 v114, s4, v59, -v130 + ; GCN-NEXT: v_exp_f32_e32 v59, v57 + ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v115 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[112:113], v[150:151], v[64:79] + ; GCN-NEXT: v_fma_f32 v112, s4, v60, -v130 + ; GCN-NEXT: v_exp_f32_e32 v60, v57 + ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v114 + ; GCN-NEXT: v_cvt_f16_f32_e32 v162, v59 + ; GCN-NEXT: v_fma_f32 v45, s4, v45, -v130 + ; GCN-NEXT: v_cvt_f16_f32_e32 v163, v60 + ; GCN-NEXT: v_fma_f32 v46, s4, v46, -v130 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[116:117], 
v[150:151], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v61, v57 + ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v112 + ; GCN-NEXT: v_pack_b32_f16 v160, v160, v162 + ; GCN-NEXT: v_fma_f32 v47, s4, v47, -v130 + ; GCN-NEXT: v_mul_f32_e32 v45, 0x3fb8aa3b, v45 + ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v130 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v16 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[120:121], v[150:151], v[96:111] + ; GCN-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127 + ; GCN-NEXT: v_exp_f32_e32 v62, v57 + ; GCN-NEXT: v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: ; implicit-def: $vgpr57 + ; GCN-NEXT: ds_read_b128 v[140:143], v57 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[144:147], v57 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[136:137], v[148:149], v[112:127] + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v61 + ; GCN-NEXT: v_fma_f32 v137, s4, v63, -v130 + ; GCN-NEXT: v_mul_f32_e32 v63, 0x3fb8aa3b, v161 + ; GCN-NEXT: v_exp_f32_e32 v63, v63 + ; GCN-NEXT: v_pack_b32_f16 v161, v163, v136 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v164 + ; GCN-NEXT: v_fma_f32 v163, s4, v34, -v130 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[138:139], v[150:151], v[112:127] + ; GCN-NEXT: 
v_fma_f32 v138, s4, v32, -v130 + ; GCN-NEXT: v_exp_f32_e32 v32, v136 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_cvt_f16_f32_e32 v162, v62 + ; GCN-NEXT: v_fma_f32 v164, s4, v36, -v130 + ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v130 + ; GCN-NEXT: v_fma_f32 v18, s4, v18, -v130 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79] + ; GCN-NEXT: v_fma_f32 v141, s4, v33, -v130 + ; GCN-NEXT: v_exp_f32_e32 v33, v136 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v138 + ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v63 + ; GCN-NEXT: v_fma_f32 v20, s4, v20, -v130 + ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v130 + ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v130 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v34, v136 + ; GCN-NEXT: ds_read_b128 v[136:139], v57 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v144, v32 + ; GCN-NEXT: ds_read_b128 v[148:151], v57 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v21 + ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v130 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[136:137], v[160:161], v[96:111] + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v33 + ; GCN-NEXT: v_perm_b32 v137, v158, v156, s5 + ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v130 + ; GCN-NEXT: v_fma_f32 v30, s4, v30, -v130 + ; GCN-NEXT: v_pack_b32_f16 v145, v144, v136 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v141 + ; GCN-NEXT: v_pack_b32_f16 v144, v162, v140 + ; GCN-NEXT: v_fma_f32 v162, s4, v35, -v130 + ; GCN-NEXT: v_exp_f32_e32 v35, v136 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v163 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79] + ; GCN-NEXT: v_exp_f32_e32 v36, v136 + ; GCN-NEXT: v_perm_b32 v136, v154, v152, s5 + ; GCN-NEXT: v_perm_b32 v140, 
v154, v152, s8 + ; GCN-NEXT: v_mul_f32_e32 v154, 0x3fb8aa3b, v162 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b64 v131, v[136:137] + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v164 + ; GCN-NEXT: v_perm_b32 v142, v155, v153, s5 + ; GCN-NEXT: v_perm_b32 v152, v155, v153, s8 + ; GCN-NEXT: v_perm_b32 v141, v158, v156, s8 + ; GCN-NEXT: v_perm_b32 v143, v159, v157, s5 + ; GCN-NEXT: v_perm_b32 v153, v159, v157, s8 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95] + ; GCN-NEXT: v_fma_f32 v147, s4, v37, -v130 + ; GCN-NEXT: v_exp_f32_e32 v37, v154 + ; GCN-NEXT: v_fma_f32 v164, s4, v38, -v130 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v132, v[140:141] + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v133, v[142:143] + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v134, v[152:153] + ; GCN-NEXT: ; implicit-def: $vgpr137 + ; GCN-NEXT: v_cvt_f16_f32_e32 v163, v34 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[138:139], v[144:145], v[96:111] + ; GCN-NEXT: v_exp_f32_e32 v38, v136 + ; GCN-NEXT: ; implicit-def: $vgpr136 + ; GCN-NEXT: ; implicit-def: $vgpr139 + ; GCN-NEXT: ; implicit-def: $vgpr138 + ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v35 + ; GCN-NEXT: v_add_u32_e32 v139, v128, v139 + ; GCN-NEXT: v_add_u32_e32 v136, v128, v136 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_load_dwordx2 v[152:153], v139, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[154:155], v136, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v136, v128, v137 + ; GCN-NEXT: buffer_load_dwordx2 v[156:157], v136, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v136, v128, v138 + ; GCN-NEXT: 
buffer_load_dwordx2 v[158:159], v136, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: ds_read_b128 v[136:139], v135 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[148:149], v[160:161], v[112:127] + ; GCN-NEXT: v_cvt_f16_f32_e32 v162, v36 + ; GCN-NEXT: v_cvt_f16_f32_e32 v148, v37 + ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v147 + ; GCN-NEXT: v_fma_f32 v149, s4, v39, -v130 + ; GCN-NEXT: v_exp_f32_e32 v39, v147 + ; GCN-NEXT: v_pack_b32_f16 v161, v162, v148 + ; GCN-NEXT: v_pack_b32_f16 v160, v163, v146 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[150:151], v[144:145], v[112:127] + ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v164 + ; GCN-NEXT: ds_read_b128 v[140:143], v135 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v145, s4, v40, -v130 + ; GCN-NEXT: v_exp_f32_e32 v40, v144 + ; GCN-NEXT: v_fma_f32 v163, s4, v41, -v130 + ; GCN-NEXT: v_cvt_f16_f32_e32 v162, v38 + ; GCN-NEXT: v_fma_f32 v31, s4, v31, -v130 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[136:137], v[160:161], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v149 + ; GCN-NEXT: v_exp_f32_e32 v41, v136 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v145 + ; GCN-NEXT: ds_read_b128 v[144:147], v135 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[148:151], v135 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v137, v39 + ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v130 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[160:161], v[80:95] + ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v40 + ; GCN-NEXT: v_fma_f32 v141, s4, v42, -v130 + ; GCN-NEXT: v_exp_f32_e32 v42, v136 + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v41 + ; 
GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 + ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v130 + ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v130 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[144:145], v[160:161], v[96:111] + ; GCN-NEXT: v_pack_b32_f16 v145, v140, v136 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v163 + ; GCN-NEXT: v_pack_b32_f16 v144, v162, v137 + ; GCN-NEXT: v_fma_f32 v137, s4, v43, -v130 + ; GCN-NEXT: v_exp_f32_e32 v43, v136 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v141 + ; GCN-NEXT: v_cvt_f16_f32_e32 v162, v42 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[138:139], v[144:145], v[64:79] + ; GCN-NEXT: v_exp_f32_e32 v163, v136 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v137 + ; GCN-NEXT: v_cvt_f16_f32_e32 v164, v43 + ; GCN-NEXT: v_fma_f32 v4, s4, v4, -v130 + ; GCN-NEXT: v_fma_f32 v5, s4, v5, -v130 + ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v130 + ; GCN-NEXT: v_mul_f32_e32 v5, 0x3fb8aa3b, v5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[144:145], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v165, v136 + ; GCN-NEXT: ds_read_b128 v[136:139], v57 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[140:143], v57 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v130 + ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v130 + ; GCN-NEXT: v_fma_f32 v14, s4, v14, -v130 + ; GCN-NEXT: v_fma_f32 v15, s4, v15, -v130 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[146:147], v[144:145], v[96:111] + ; GCN-NEXT: v_exp_f32_e32 v166, v44 + ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v163 + ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v165 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[148:149], v[160:161], v[112:127] + ; GCN-NEXT: v_pack_b32_f16 v149, v146, v44 + ; GCN-NEXT: v_mul_f32_e32 v44, 0x3fb8aa3b, v46 + ; GCN-NEXT: v_exp_f32_e32 v160, v45 + ; GCN-NEXT: v_pack_b32_f16 v148, v162, v164 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[150:151], v[144:145], v[112:127] + ; 
GCN-NEXT: v_exp_f32_e32 v161, v44 + ; GCN-NEXT: v_mul_f32_e32 v44, 0x3fb8aa3b, v47 + ; GCN-NEXT: v_cvt_f16_f32_e32 v150, v166 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[136:137], v[148:149], v[64:79] + ; GCN-NEXT: v_exp_f32_e32 v162, v44 + ; GCN-NEXT: ds_read_b128 v[44:47], v57 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v137, v161 + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v160 + ; GCN-NEXT: ds_read_b128 v[144:147], v57 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_pack_b32_f16 v136, v150, v136 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v164, v16 + ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v162 + ; GCN-NEXT: v_fma_f32 v140, s4, v19, -v130 + ; GCN-NEXT: v_perm_b32 v19, v158, v156, s8 + ; GCN-NEXT: v_mul_f32_e32 v140, 0x3fb8aa3b, v140 + ; GCN-NEXT: v_pack_b32_f16 v137, v137, v16 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v17 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[44:45], v[148:149], v[96:111] + ; GCN-NEXT: v_exp_f32_e32 v167, v16 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v18 + ; GCN-NEXT: v_perm_b32 v17, v158, v156, s5 + ; GCN-NEXT: v_perm_b32 v18, v154, v152, s8 + ; GCN-NEXT: v_perm_b32 v44, v155, v153, s5 + ; GCN-NEXT: v_perm_b32 v45, v159, v157, s5 + ; GCN-NEXT: v_cvt_f16_f32_e32 v168, v164 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[138:139], v[136:137], v[64:79] + ; GCN-NEXT: v_exp_f32_e32 v169, v16 + ; GCN-NEXT: v_perm_b32 v16, v154, v152, s5 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b64 v131, v[16:17] + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v20 + ; GCN-NEXT: v_perm_b32 v138, v155, v153, s8 + ; GCN-NEXT: v_perm_b32 v139, v159, v157, s8 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v132, v[18:19] + ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[136:137], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v155, v140 + ; GCN-NEXT: ; implicit-def: $vgpr17 + ; GCN-NEXT: ; implicit-def: $vgpr18 + ; GCN-NEXT: ; implicit-def: $vgpr19 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v133, v[44:45] + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v134, v[138:139] + ; GCN-NEXT: v_add_u32_e32 v19, v128, v19 + ; GCN-NEXT: v_add_u32_e32 v17, v128, v17 + ; GCN-NEXT: v_add_u32_e32 v18, v128, v18 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_load_dwordx2 v[140:141], v19, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[46:47], v[136:137], v[96:111] + ; GCN-NEXT: v_exp_f32_e32 v156, v16 + ; GCN-NEXT: ; implicit-def: $vgpr16 + ; GCN-NEXT: v_cvt_f16_f32_e32 v154, v167 + ; GCN-NEXT: v_add_u32_e32 v16, v128, v16 + ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v16, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[150:151], v17, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[152:153], v18, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: ds_read_b128 v[16:19], v135 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127] + ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v169 + ; GCN-NEXT: v_cvt_f16_f32_e32 v128, v155 + ; GCN-NEXT: v_pack_b32_f16 v144, v168, v154 + ; GCN-NEXT: ds_read_b128 v[44:47], v135 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v148, v21 + ; GCN-NEXT: 
v_pack_b32_f16 v145, v20, v128 + ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v22 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[136:137], v[112:127] + ; GCN-NEXT: v_fma_f32 v21, s4, v24, -v130 + ; GCN-NEXT: v_exp_f32_e32 v146, v20 + ; GCN-NEXT: v_cvt_f16_f32_e32 v128, v156 + ; GCN-NEXT: v_fma_f32 v147, s4, v25, -v130 + ; GCN-NEXT: ; implicit-def: $sgpr0 + ; GCN-NEXT: v_cvt_f16_f32_e32 v24, v146 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[16:17], v[144:145], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v23 + ; GCN-NEXT: v_exp_f32_e32 v149, v16 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v21 + ; GCN-NEXT: ds_read_b128 v[20:23], v135 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[136:139], v135 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v148 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[144:145], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v154, v16 + ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v149 + ; GCN-NEXT: v_pack_b32_f16 v25, v24, v16 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v147 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[20:21], v[144:145], v[96:111] + ; GCN-NEXT: v_pack_b32_f16 v24, v128, v17 + ; GCN-NEXT: v_fma_f32 v17, s4, v27, -v130 + ; GCN-NEXT: v_exp_f32_e32 v128, v16 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v26 + ; GCN-NEXT: v_cvt_f16_f32_e32 v26, v154 + ; GCN-NEXT: v_cvt_f16_f32_e32 v27, v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[18:19], v[24:25], v[64:79] + ; GCN-NEXT: v_fma_f32 v18, s4, v28, -v130 + ; GCN-NEXT: v_exp_f32_e32 v147, v16 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v17 + ; GCN-NEXT: v_fma_f32 v28, s4, v29, -v130 + ; GCN-NEXT: v_mul_f32_e32 v28, 0x3fb8aa3b, v28 + ; GCN-NEXT: v_cvt_f16_f32_e32 v29, v147 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[24:25], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v46, v16 + ; GCN-NEXT: v_mul_f32_e32 v16, 
0x3fb8aa3b, v18 + ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v46 + ; GCN-NEXT: v_pack_b32_f16 v45, v29, v44 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[22:23], v[24:25], v[96:111] + ; GCN-NEXT: v_exp_f32_e32 v47, v16 + ; GCN-NEXT: ds_read_b128 v[16:19], v57 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[20:23], v57 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_pack_b32_f16 v44, v26, v27 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[136:137], v[144:145], v[112:127] + ; GCN-NEXT: v_exp_f32_e32 v136, v28 + ; GCN-NEXT: v_cvt_f16_f32_e32 v137, v47 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[138:139], v[24:25], v[112:127] + ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v30 + ; GCN-NEXT: v_exp_f32_e32 v138, v24 + ; GCN-NEXT: ds_read_b128 v[24:27], v57 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[16:17], v[44:45], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v31 + ; GCN-NEXT: v_exp_f32_e32 v139, v16 + ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v138 + ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v136 + ; GCN-NEXT: ds_read_b128 v[28:31], v57 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[44:45], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v144, v0 + ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v139 + ; GCN-NEXT: v_pack_b32_f16 v20, v137, v17 + ; GCN-NEXT: v_perm_b32 v17, v153, v151, s5 + ; GCN-NEXT: v_cvt_f16_f32_e32 v145, v144 + ; GCN-NEXT: v_pack_b32_f16 v21, v16, v0 + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[24:25], v[44:45], v[96:111] + ; GCN-NEXT: v_exp_f32_e32 v137, v0 + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v2 + ; GCN-NEXT: v_fma_f32 v24, s4, v3, -v130 + ; GCN-NEXT: v_perm_b32 
v2, v142, v140, s8 + ; GCN-NEXT: v_perm_b32 v16, v143, v141, s5 + ; GCN-NEXT: v_perm_b32 v1, v152, v150, s5 + ; GCN-NEXT: v_perm_b32 v3, v152, v150, s8 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[18:19], v[20:21], v[64:79] + ; GCN-NEXT: v_exp_f32_e32 v157, v0 + ; GCN-NEXT: v_perm_b32 v0, v142, v140, s5 + ; GCN-NEXT: v_perm_b32 v18, v143, v141, s8 + ; GCN-NEXT: v_perm_b32 v19, v153, v151, s8 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b64 v131, v[0:1] + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v132, v[2:3] + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v133, v[16:17] + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v134, v[18:19] + ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v4 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[20:21], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v140, v24 + ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v137 + ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v157 + ; GCN-NEXT: v_cvt_f16_f32_e32 v23, v140 + ; GCN-NEXT: v_pack_b32_f16 v24, v145, v22 + ; GCN-NEXT: v_pack_b32_f16 v25, v4, v23 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[26:27], v[20:21], v[96:111] + ; GCN-NEXT: v_exp_f32_e32 v26, v0 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[0:3], v135 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v6 + ; GCN-NEXT: ds_read_b128 v[16:19], v135 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[28:29], v[44:45], v[112:127] + ; GCN-NEXT: v_exp_f32_e32 v27, v5 + ; GCN-NEXT: v_fma_f32 v5, s4, v8, -v130 + ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v26 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[30:31], 
v[20:21], v[112:127] + ; GCN-NEXT: v_exp_f32_e32 v29, v4 + ; GCN-NEXT: v_fma_f32 v30, s4, v9, -v130 + ; GCN-NEXT: v_cvt_f16_f32_e32 v8, v29 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[0:1], v[24:25], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v7 + ; GCN-NEXT: v_exp_f32_e32 v31, v0 + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v5 + ; GCN-NEXT: ds_read_b128 v[4:7], v135 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[20:23], v135 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v27 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[16:17], v[24:25], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v16, v0 + ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v31 + ; GCN-NEXT: v_pack_b32_f16 v9, v8, v0 + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v30 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[4:5], v[24:25], v[96:111] + ; GCN-NEXT: v_pack_b32_f16 v8, v28, v1 + ; GCN-NEXT: v_fma_f32 v1, s4, v11, -v130 + ; GCN-NEXT: v_exp_f32_e32 v17, v0 + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v10 + ; GCN-NEXT: v_cvt_f16_f32_e32 v10, v16 + ; GCN-NEXT: v_cvt_f16_f32_e32 v11, v17 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[2:3], v[8:9], v[64:79] + ; GCN-NEXT: v_fma_f32 v2, s4, v12, -v130 + ; GCN-NEXT: v_exp_f32_e32 v28, v0 + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v1 + ; GCN-NEXT: v_fma_f32 v12, s4, v13, -v130 + ; GCN-NEXT: v_mul_f32_e32 v12, 0x3fb8aa3b, v12 + ; GCN-NEXT: v_cvt_f16_f32_e32 v13, v28 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[18:19], v[8:9], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v18, v0 + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v2 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[6:7], v[8:9], v[96:111] + ; GCN-NEXT: v_exp_f32_e32 v19, v0 + ; GCN-NEXT: ds_read_b128 v[0:3], v57 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[4:7], v57 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: 
buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[20:21], v[24:25], v[112:127] + ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v18 + ; GCN-NEXT: v_exp_f32_e32 v21, v12 + ; GCN-NEXT: v_pack_b32_f16 v12, v10, v11 + ; GCN-NEXT: v_pack_b32_f16 v13, v13, v20 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[22:23], v[8:9], v[112:127] + ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v14 + ; GCN-NEXT: v_exp_f32_e32 v20, v8 + ; GCN-NEXT: v_cvt_f16_f32_e32 v14, v19 + ; GCN-NEXT: ds_read_b128 v[8:11], v57 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[0:1], v[12:13], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v15 + ; GCN-NEXT: v_exp_f32_e32 v22, v0 + ; GCN-NEXT: v_cvt_f16_f32_e32 v15, v21 + ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v22 + ; GCN-NEXT: v_pack_b32_f16 v1, v0, v1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[4:5], v[12:13], v[80:95] + ; GCN-NEXT: v_add_f32_e32 v4, 0, v49 + ; GCN-NEXT: v_add_f32_e32 v4, v50, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v51, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v52, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v53, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v54, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v55, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v56, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v58, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v59, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v60, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v61, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v62, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v63, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v32, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v33, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v34, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v35, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v36, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v37, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v38, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v39, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v40, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v41, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v42, v4 + ; GCN-NEXT: 
v_add_f32_e32 v4, v43, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v163, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v165, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v166, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v160, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v161, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v162, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v164, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v167, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v169, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v155, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v156, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v148, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v146, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v149, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v154, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v128, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v147, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v46, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v47, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v136, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v138, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v139, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v144, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v137, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v157, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v140, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v26, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v27, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v29, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v31, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v16, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v17, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v28, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v18, v4 + ; GCN-NEXT: v_pack_b32_f16 v0, v14, v15 + ; GCN-NEXT: v_add_f32_e32 v4, v19, v4 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[8:9], v[12:13], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[2:3], v[0:1], v[64:79] + ; GCN-NEXT: v_add_f32_e32 v2, v21, v4 + ; GCN-NEXT: v_add_f32_e32 v2, v20, v2 + ; GCN-NEXT: v_add_f32_e32 v2, v22, v2 + ; GCN-NEXT: ds_bpermute_b32 v3, v129, v2 + ; GCN-NEXT: ; implicit-def: $vgpr4 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_add_f32_e32 v2, v2, v3 + ; GCN-NEXT: ds_bpermute_b32 v3, v129, v2 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 
v[80:95], v[6:7], v[0:1], v[80:95] + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[6:7] + ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v48 + ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: s_endpgm + attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} + !0 = !{i64 2862105} + +... + +--- +name: largeInterleave +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + occupancy: 7 +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2, $sgpr3, $sgpr4 + %11:vgpr_32 = IMPLICIT_DEF + %1:sgpr_512 = IMPLICIT_DEF + %16:vgpr_32 = IMPLICIT_DEF + %443:sgpr_128 = IMPLICIT_DEF + %18:sreg_32 = IMPLICIT_DEF + %25:vgpr_32 = IMPLICIT_DEF + %23:vgpr_32 = IMPLICIT_DEF + %391:vreg_128_align2 = IMPLICIT_DEF + %24:vgpr_32 = IMPLICIT_DEF + %392:vreg_128_align2 = IMPLICIT_DEF + %401:vreg_128_align2 = IMPLICIT_DEF + %406:vreg_128_align2 = IMPLICIT_DEF + %48:vgpr_32 = IMPLICIT_DEF + %473:sgpr_128 = IMPLICIT_DEF + %411:vreg_128_align2 = IMPLICIT_DEF + %416:vreg_128_align2 = IMPLICIT_DEF + %421:vreg_128_align2 = IMPLICIT_DEF + %426:vreg_128_align2 = IMPLICIT_DEF + %1114:sgpr_32 = IMPLICIT_DEF + %39:vgpr_32 = IMPLICIT_DEF + %484:sreg_64_xexec = IMPLICIT_DEF + %3346:vgpr_32 = IMPLICIT_DEF + %1422:sreg_32 = IMPLICIT_DEF + %1424:sreg_32 = IMPLICIT_DEF + %15:vgpr_32 = IMPLICIT_DEF + %494:sreg_32 = IMPLICIT_DEF + %47:vgpr_32 = IMPLICIT_DEF + %41:vgpr_32 = IMPLICIT_DEF + %42:vgpr_32 = IMPLICIT_DEF + %43:vgpr_32 = IMPLICIT_DEF + %44:vgpr_32 = IMPLICIT_DEF + %45:vgpr_32 = IMPLICIT_DEF + %50:sreg_32 = IMPLICIT_DEF + %3347:vgpr_32 = IMPLICIT_DEF + %3329:vgpr_32 = IMPLICIT_DEF + %3330:vgpr_32 = IMPLICIT_DEF + %3331:vgpr_32 = IMPLICIT_DEF + %3332:vgpr_32 = IMPLICIT_DEF + %3333:vgpr_32 = IMPLICIT_DEF + %2986:vreg_512_align2 = IMPLICIT_DEF + %3038:vreg_512_align2 = IMPLICIT_DEF + 
%2980:vreg_512_align2 = IMPLICIT_DEF + %3003:vreg_512_align2 = IMPLICIT_DEF + %3334:vgpr_32 = IMPLICIT_DEF + %3335:vgpr_32 = IMPLICIT_DEF + %3336:vgpr_32 = IMPLICIT_DEF + %3337:vgpr_32 = IMPLICIT_DEF + %3338:vgpr_32 = IMPLICIT_DEF + %3339:vgpr_32 = IMPLICIT_DEF + %3345:vgpr_32 = IMPLICIT_DEF + %3340:vgpr_32 = IMPLICIT_DEF + %3341:vgpr_32 = IMPLICIT_DEF + %3342:vgpr_32 = IMPLICIT_DEF + %3343:vgpr_32 = IMPLICIT_DEF + %3344:vgpr_32 = IMPLICIT_DEF + %84:vgpr_32 = COPY %3347 + %86:vgpr_32 = COPY %3347:vgpr_32 + IGLP_OPT 2 + %593:sreg_32 = V_READFIRSTLANE_B32 %11:vgpr_32, implicit $exec + %595:vgpr_32 = V_LSHL_ADD_U32_e64 %593:sreg_32, 4, %3329:vgpr_32, implicit $exec + %597:vgpr_32 = nsw V_MUL_LO_U32_e64 %595:vgpr_32, %1.sub6:sgpr_512, implicit $exec + %599:vgpr_32 = V_ADD_LSHL_U32_e64 %597:vgpr_32, %16:vgpr_32, 1, implicit $exec + %601:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec + %602:vgpr_32 = V_ADD_U32_e32 %18:sreg_32, %599:vgpr_32, implicit $exec + %603:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %602:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec + %605:sreg_32 = S_LSHL_B32 %593:sreg_32, 7, implicit-def dead $scc + %606:vgpr_32 = V_ADD_LSHL_U32_e64 %25:vgpr_32, %605:sreg_32, 1, implicit $exec + DS_WRITE_B128_gfx9 %606:vgpr_32, %601:vreg_128_align2, 0, 0, implicit $exec + DS_WRITE_B128_gfx9 %606:vgpr_32, %603:vreg_128_align2, 1024, 0, implicit $exec + %608:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 64, 0, 0, implicit $exec + %610:vgpr_32 = V_ADD_U32_e32 64, %602:vgpr_32, implicit $exec + %611:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %610:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %612:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec + early-clobber %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 
%612.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %612.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %626:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec + early-clobber %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %626.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %626.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %638:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec + early-clobber %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %638.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %638.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %650:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec + early-clobber %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %650.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %650.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %662:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %662.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %668:vreg_512_align2 = 
contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %662.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %673:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %673.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %673.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %684:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %684.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %684.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %695:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %695.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %695.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + DS_WRITE_B128_gfx9 %606:vgpr_32, %608:vreg_128_align2, 0, 0, implicit $exec + DS_WRITE_B128_gfx9 %606:vgpr_32, %611:vreg_128_align2, 1024, 0, implicit $exec + %706:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 128, 0, 0, implicit $exec + %708:vgpr_32 = 
V_ADD_U32_e32 128, %602:vgpr_32, implicit $exec + %709:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %708:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %710:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %710.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %710.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %721:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %721.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %721.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %732:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %732.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %732.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %743:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %743.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %701:vreg_512_align2 = 
contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %743.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %754:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %754.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %754.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %765:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %765.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %765.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %776:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %776.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %776.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %787:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %787.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %787.sub2_sub3:vreg_128_align2, 
%406.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + DS_WRITE_B128_gfx9 %606:vgpr_32, %706:vreg_128_align2, 0, 0, implicit $exec + DS_WRITE_B128_gfx9 %606:vgpr_32, %709:vreg_128_align2, 1024, 0, implicit $exec + %798:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 192, 0, 0, implicit $exec + %800:vgpr_32 = V_ADD_U32_e32 192, %602:vgpr_32, implicit $exec + %801:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %800:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec + %802:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3330:vgpr_32, implicit $exec + %803:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %802:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %804:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3331:vgpr_32, implicit $exec + %805:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %804:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %806:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3332:vgpr_32, implicit $exec + %807:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %806:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %808:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3333:vgpr_32, implicit $exec + %809:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %808:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %810:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %810.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %810.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %821:vreg_128_align2 = 
DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %821.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %821.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %832:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %832.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %832.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %843:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %843.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %843.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %854:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %854.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %854.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %865:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec + %679:vreg_512_align2 = contract 
V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %865.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %865.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %876:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %876.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %876.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %887:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %887.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %887.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + DS_WRITE_B128_gfx9 %606:vgpr_32, %798:vreg_128_align2, 0, 0, implicit $exec + DS_WRITE_B128_gfx9 %606:vgpr_32, %801:vreg_128_align2, 1024, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %898:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %898.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %668:vreg_512_align2 
= contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %898.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %909:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %909.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %909.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %920:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %920.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %920.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %931:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %931.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %931.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %942:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %942.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %942.sub2_sub3:vreg_128_align2, 
%426.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %969:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %969.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %969.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %996:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %996.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %996.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %1023:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1023.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1023.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %1050:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub0:vreg_512_align2, implicit $mode, implicit $exec + %1051:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub1:vreg_512_align2, implicit $mode, implicit $exec + %1052:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub2:vreg_512_align2, implicit $mode, implicit $exec + %1053:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub3:vreg_512_align2, 
implicit $mode, implicit $exec + %1054:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub4:vreg_512_align2, implicit $mode, implicit $exec + %1055:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub5:vreg_512_align2, implicit $mode, implicit $exec + %1056:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub6:vreg_512_align2, implicit $mode, implicit $exec + %1057:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub7:vreg_512_align2, implicit $mode, implicit $exec + %1058:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub8:vreg_512_align2, implicit $mode, implicit $exec + %1059:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub9:vreg_512_align2, implicit $mode, implicit $exec + %1060:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub10:vreg_512_align2, implicit $mode, implicit $exec + %1061:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub11:vreg_512_align2, implicit $mode, implicit $exec + %1062:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub12:vreg_512_align2, implicit $mode, implicit $exec + %1063:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub13:vreg_512_align2, implicit $mode, implicit $exec + %1064:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub14:vreg_512_align2, implicit $mode, implicit $exec + %1065:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub15:vreg_512_align2, implicit $mode, implicit $exec + %1066:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub0:vreg_512_align2, implicit $mode, implicit $exec + %1067:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub1:vreg_512_align2, implicit $mode, implicit $exec + %1068:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub2:vreg_512_align2, implicit $mode, implicit $exec + %1069:vgpr_32 = contract nofpexcept 
V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub3:vreg_512_align2, implicit $mode, implicit $exec + %1070:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub4:vreg_512_align2, implicit $mode, implicit $exec + %1071:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub5:vreg_512_align2, implicit $mode, implicit $exec + %1072:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub6:vreg_512_align2, implicit $mode, implicit $exec + %1073:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub7:vreg_512_align2, implicit $mode, implicit $exec + %1074:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub8:vreg_512_align2, implicit $mode, implicit $exec + %1075:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub9:vreg_512_align2, implicit $mode, implicit $exec + %1076:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub10:vreg_512_align2, implicit $mode, implicit $exec + %1077:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub11:vreg_512_align2, implicit $mode, implicit $exec + %1078:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub12:vreg_512_align2, implicit $mode, implicit $exec + %1079:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub13:vreg_512_align2, implicit $mode, implicit $exec + %1080:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub14:vreg_512_align2, implicit $mode, implicit $exec + %1081:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub15:vreg_512_align2, implicit $mode, implicit $exec + %1082:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub0:vreg_512_align2, implicit $mode, implicit $exec + %1083:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub1:vreg_512_align2, implicit $mode, implicit $exec + %1084:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub2:vreg_512_align2, implicit 
$mode, implicit $exec + %1085:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub3:vreg_512_align2, implicit $mode, implicit $exec + %1086:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub4:vreg_512_align2, implicit $mode, implicit $exec + %1087:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub5:vreg_512_align2, implicit $mode, implicit $exec + %1088:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub6:vreg_512_align2, implicit $mode, implicit $exec + %1089:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub7:vreg_512_align2, implicit $mode, implicit $exec + %1090:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub8:vreg_512_align2, implicit $mode, implicit $exec + %1091:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub9:vreg_512_align2, implicit $mode, implicit $exec + %1092:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub10:vreg_512_align2, implicit $mode, implicit $exec + %1093:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub11:vreg_512_align2, implicit $mode, implicit $exec + %1094:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub12:vreg_512_align2, implicit $mode, implicit $exec + %1095:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub13:vreg_512_align2, implicit $mode, implicit $exec + %1096:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub14:vreg_512_align2, implicit $mode, implicit $exec + %1097:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub15:vreg_512_align2, implicit $mode, implicit $exec + %1098:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub0:vreg_512_align2, implicit $mode, implicit $exec + %1099:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub1:vreg_512_align2, implicit $mode, implicit $exec + %1100:vgpr_32 = contract nofpexcept V_MUL_F32_e32 
%1.sub4:sgpr_512, %701.sub2:vreg_512_align2, implicit $mode, implicit $exec + %1101:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub3:vreg_512_align2, implicit $mode, implicit $exec + %1102:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub4:vreg_512_align2, implicit $mode, implicit $exec + %1103:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub5:vreg_512_align2, implicit $mode, implicit $exec + %1104:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub6:vreg_512_align2, implicit $mode, implicit $exec + %1105:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub7:vreg_512_align2, implicit $mode, implicit $exec + %1106:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub8:vreg_512_align2, implicit $mode, implicit $exec + %1107:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub9:vreg_512_align2, implicit $mode, implicit $exec + %1108:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub10:vreg_512_align2, implicit $mode, implicit $exec + %1109:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub11:vreg_512_align2, implicit $mode, implicit $exec + %1110:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub12:vreg_512_align2, implicit $mode, implicit $exec + %1111:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub13:vreg_512_align2, implicit $mode, implicit $exec + %1112:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub14:vreg_512_align2, implicit $mode, implicit $exec + %1113:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub15:vreg_512_align2, implicit $mode, implicit $exec + %1115:vgpr_32 = V_MAX3_F32_e64 0, %1050:vgpr_32, 0, %1114:sgpr_32, 0, %1051:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1116:vgpr_32 = V_MAX3_F32_e64 0, %1115:vgpr_32, 0, %1052:vgpr_32, 0, %1053:vgpr_32, 0, 0, implicit $mode, implicit $exec + 
%1117:vgpr_32 = V_MAX3_F32_e64 0, %1116:vgpr_32, 0, %1054:vgpr_32, 0, %1055:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1118:vgpr_32 = V_MAX3_F32_e64 0, %1117:vgpr_32, 0, %1056:vgpr_32, 0, %1057:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1119:vgpr_32 = V_MAX3_F32_e64 0, %1118:vgpr_32, 0, %1058:vgpr_32, 0, %1059:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1120:vgpr_32 = V_MAX3_F32_e64 0, %1119:vgpr_32, 0, %1060:vgpr_32, 0, %1061:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1121:vgpr_32 = V_MAX3_F32_e64 0, %1120:vgpr_32, 0, %1062:vgpr_32, 0, %1063:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1122:vgpr_32 = V_MAX3_F32_e64 0, %1121:vgpr_32, 0, %1064:vgpr_32, 0, %1065:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1123:vgpr_32 = V_MAX3_F32_e64 0, %1122:vgpr_32, 0, %1066:vgpr_32, 0, %1067:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1124:vgpr_32 = V_MAX3_F32_e64 0, %1123:vgpr_32, 0, %1068:vgpr_32, 0, %1069:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1125:vgpr_32 = V_MAX3_F32_e64 0, %1124:vgpr_32, 0, %1070:vgpr_32, 0, %1071:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1126:vgpr_32 = V_MAX3_F32_e64 0, %1125:vgpr_32, 0, %1072:vgpr_32, 0, %1073:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1127:vgpr_32 = V_MAX3_F32_e64 0, %1126:vgpr_32, 0, %1074:vgpr_32, 0, %1075:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1128:vgpr_32 = V_MAX3_F32_e64 0, %1127:vgpr_32, 0, %1076:vgpr_32, 0, %1077:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1129:vgpr_32 = V_MAX3_F32_e64 0, %1128:vgpr_32, 0, %1078:vgpr_32, 0, %1079:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1130:vgpr_32 = V_MAX3_F32_e64 0, %1129:vgpr_32, 0, %1080:vgpr_32, 0, %1081:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1131:vgpr_32 = V_MAX3_F32_e64 0, %1130:vgpr_32, 0, %1082:vgpr_32, 0, %1083:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1132:vgpr_32 = V_MAX3_F32_e64 0, %1131:vgpr_32, 0, %1084:vgpr_32, 0, %1085:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1133:vgpr_32 = 
V_MAX3_F32_e64 0, %1132:vgpr_32, 0, %1086:vgpr_32, 0, %1087:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1134:vgpr_32 = V_MAX3_F32_e64 0, %1133:vgpr_32, 0, %1088:vgpr_32, 0, %1089:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1135:vgpr_32 = V_MAX3_F32_e64 0, %1134:vgpr_32, 0, %1090:vgpr_32, 0, %1091:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1136:vgpr_32 = V_MAX3_F32_e64 0, %1135:vgpr_32, 0, %1092:vgpr_32, 0, %1093:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1137:vgpr_32 = V_MAX3_F32_e64 0, %1136:vgpr_32, 0, %1094:vgpr_32, 0, %1095:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1138:vgpr_32 = V_MAX3_F32_e64 0, %1137:vgpr_32, 0, %1096:vgpr_32, 0, %1097:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1139:vgpr_32 = V_MAX3_F32_e64 0, %1138:vgpr_32, 0, %1098:vgpr_32, 0, %1099:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1140:vgpr_32 = V_MAX3_F32_e64 0, %1139:vgpr_32, 0, %1100:vgpr_32, 0, %1101:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1141:vgpr_32 = V_MAX3_F32_e64 0, %1140:vgpr_32, 0, %1102:vgpr_32, 0, %1103:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1142:vgpr_32 = V_MAX3_F32_e64 0, %1141:vgpr_32, 0, %1104:vgpr_32, 0, %1105:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1143:vgpr_32 = V_MAX3_F32_e64 0, %1142:vgpr_32, 0, %1106:vgpr_32, 0, %1107:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1144:vgpr_32 = V_MAX3_F32_e64 0, %1143:vgpr_32, 0, %1108:vgpr_32, 0, %1109:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1145:vgpr_32 = V_MAX3_F32_e64 0, %1144:vgpr_32, 0, %1110:vgpr_32, 0, %1111:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1146:vgpr_32 = V_MAX3_F32_e64 0, %1145:vgpr_32, 0, %1112:vgpr_32, 0, %1113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1147:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1146:vgpr_32, 0, implicit $exec + %1148:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1147:vgpr_32, %1147:vgpr_32, implicit $mode, implicit $exec + %1149:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1146:vgpr_32, %1148:vgpr_32, 
implicit $mode, implicit $exec + %1150:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1149:vgpr_32, 0, implicit $exec + %1151:vgpr_32 = V_CNDMASK_B32_e64 0, %1150:vgpr_32, 0, %1149:vgpr_32, %484:sreg_64_xexec, implicit $exec + %1153:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1151:vgpr_32, %1151:vgpr_32, implicit $mode, implicit $exec + %1154:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %3346:vgpr_32, %3346:vgpr_32, implicit $mode, implicit $exec + %151:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1154:vgpr_32, %1153:vgpr_32, implicit $mode, implicit $exec + %1155:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1157:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1155:vgpr_32, implicit $mode, implicit $exec + %1158:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1157:vgpr_32, implicit $mode, implicit $exec + %1159:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1160:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1159:vgpr_32, implicit $mode, implicit $exec + %1161:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1160:vgpr_32, implicit $mode, implicit $exec + %1162:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1163:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1162:vgpr_32, implicit $mode, implicit $exec + %1164:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1163:vgpr_32, implicit $mode, implicit $exec + %1165:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1166:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1165:vgpr_32, implicit $mode, implicit $exec + %1167:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1166:vgpr_32, implicit $mode, implicit $exec + %1168:vgpr_32 = 
contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1169:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1168:vgpr_32, implicit $mode, implicit $exec + %1170:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1169:vgpr_32, implicit $mode, implicit $exec + %1171:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1172:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1171:vgpr_32, implicit $mode, implicit $exec + %1173:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1172:vgpr_32, implicit $mode, implicit $exec + %1174:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1175:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1174:vgpr_32, implicit $mode, implicit $exec + %1176:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1175:vgpr_32, implicit $mode, implicit $exec + %1177:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1178:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1177:vgpr_32, implicit $mode, implicit $exec + %1179:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1178:vgpr_32, implicit $mode, implicit $exec + %1180:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1181:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1180:vgpr_32, implicit $mode, implicit $exec + %1182:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1181:vgpr_32, implicit $mode, implicit $exec + %1183:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1184:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1183:vgpr_32, implicit 
$mode, implicit $exec + %1185:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1184:vgpr_32, implicit $mode, implicit $exec + %1186:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1187:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1186:vgpr_32, implicit $mode, implicit $exec + %1188:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1187:vgpr_32, implicit $mode, implicit $exec + %1189:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1190:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1189:vgpr_32, implicit $mode, implicit $exec + %1191:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1190:vgpr_32, implicit $mode, implicit $exec + %1192:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1193:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1192:vgpr_32, implicit $mode, implicit $exec + %1194:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1193:vgpr_32, implicit $mode, implicit $exec + %1195:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1196:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1195:vgpr_32, implicit $mode, implicit $exec + %1197:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1196:vgpr_32, implicit $mode, implicit $exec + %1198:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1199:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1198:vgpr_32, implicit $mode, implicit $exec + %1200:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1199:vgpr_32, implicit $mode, implicit $exec + %1201:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub15:vreg_512_align2, 
1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1202:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1201:vgpr_32, implicit $mode, implicit $exec + %1203:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1202:vgpr_32, implicit $mode, implicit $exec + %1204:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1205:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1204:vgpr_32, implicit $mode, implicit $exec + %1206:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1205:vgpr_32, implicit $mode, implicit $exec + %1207:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1208:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1207:vgpr_32, implicit $mode, implicit $exec + %1209:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1208:vgpr_32, implicit $mode, implicit $exec + %1210:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1211:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1210:vgpr_32, implicit $mode, implicit $exec + %1212:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1211:vgpr_32, implicit $mode, implicit $exec + %1213:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1214:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1213:vgpr_32, implicit $mode, implicit $exec + %1215:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1214:vgpr_32, implicit $mode, implicit $exec + %1216:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1217:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1216:vgpr_32, implicit $mode, implicit $exec + %1218:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1217:vgpr_32, 
implicit $mode, implicit $exec + %1219:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1220:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1219:vgpr_32, implicit $mode, implicit $exec + %1221:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1220:vgpr_32, implicit $mode, implicit $exec + %1222:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1223:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1222:vgpr_32, implicit $mode, implicit $exec + %1224:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1223:vgpr_32, implicit $mode, implicit $exec + %1225:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1226:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1225:vgpr_32, implicit $mode, implicit $exec + %1227:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1226:vgpr_32, implicit $mode, implicit $exec + %1228:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1229:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1228:vgpr_32, implicit $mode, implicit $exec + %1230:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1229:vgpr_32, implicit $mode, implicit $exec + %1231:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1232:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1231:vgpr_32, implicit $mode, implicit $exec + %1233:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1232:vgpr_32, implicit $mode, implicit $exec + %1234:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1235:vgpr_32 = afn nofpexcept 
V_MUL_F32_e32 1069066811, %1234:vgpr_32, implicit $mode, implicit $exec + %1236:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1235:vgpr_32, implicit $mode, implicit $exec + %1237:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1238:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1237:vgpr_32, implicit $mode, implicit $exec + %1239:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1238:vgpr_32, implicit $mode, implicit $exec + %1240:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1241:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1240:vgpr_32, implicit $mode, implicit $exec + %1242:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1241:vgpr_32, implicit $mode, implicit $exec + %1243:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1244:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1243:vgpr_32, implicit $mode, implicit $exec + %1245:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1244:vgpr_32, implicit $mode, implicit $exec + %1246:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1247:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1246:vgpr_32, implicit $mode, implicit $exec + %1248:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1247:vgpr_32, implicit $mode, implicit $exec + %1249:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1250:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1249:vgpr_32, implicit $mode, implicit $exec + %1251:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1250:vgpr_32, implicit $mode, implicit $exec + %1252:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, 
%1.sub4:sgpr_512, 0, %690.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1253:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1252:vgpr_32, implicit $mode, implicit $exec + %1254:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1253:vgpr_32, implicit $mode, implicit $exec + %1255:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1256:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1255:vgpr_32, implicit $mode, implicit $exec + %1257:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1256:vgpr_32, implicit $mode, implicit $exec + %1258:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1259:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1258:vgpr_32, implicit $mode, implicit $exec + %1260:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1259:vgpr_32, implicit $mode, implicit $exec + %1261:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1262:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1261:vgpr_32, implicit $mode, implicit $exec + %1263:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1262:vgpr_32, implicit $mode, implicit $exec + %1264:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1265:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1264:vgpr_32, implicit $mode, implicit $exec + %1266:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1265:vgpr_32, implicit $mode, implicit $exec + %1267:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1268:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1267:vgpr_32, implicit $mode, implicit $exec + %1269:vgpr_32 = 
afn nofpexcept V_EXP_F32_e32 %1268:vgpr_32, implicit $mode, implicit $exec + %1270:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1271:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1270:vgpr_32, implicit $mode, implicit $exec + %1272:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1271:vgpr_32, implicit $mode, implicit $exec + %1273:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1274:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1273:vgpr_32, implicit $mode, implicit $exec + %1275:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1274:vgpr_32, implicit $mode, implicit $exec + %1276:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1277:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1276:vgpr_32, implicit $mode, implicit $exec + %1278:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1277:vgpr_32, implicit $mode, implicit $exec + %1279:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1280:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1279:vgpr_32, implicit $mode, implicit $exec + %1281:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1280:vgpr_32, implicit $mode, implicit $exec + %1282:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1283:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1282:vgpr_32, implicit $mode, implicit $exec + %1284:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1283:vgpr_32, implicit $mode, implicit $exec + %1285:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, 
implicit $exec + %1286:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1285:vgpr_32, implicit $mode, implicit $exec + %1287:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1286:vgpr_32, implicit $mode, implicit $exec + %1288:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1289:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1288:vgpr_32, implicit $mode, implicit $exec + %1290:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1289:vgpr_32, implicit $mode, implicit $exec + %1291:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1292:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1291:vgpr_32, implicit $mode, implicit $exec + %1293:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1292:vgpr_32, implicit $mode, implicit $exec + %1294:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1295:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1294:vgpr_32, implicit $mode, implicit $exec + %1296:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1295:vgpr_32, implicit $mode, implicit $exec + %1297:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1298:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1297:vgpr_32, implicit $mode, implicit $exec + %1299:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1298:vgpr_32, implicit $mode, implicit $exec + %1300:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1301:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1300:vgpr_32, implicit $mode, implicit $exec + %1302:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1301:vgpr_32, implicit $mode, implicit $exec + 
%1303:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1304:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1303:vgpr_32, implicit $mode, implicit $exec + %1305:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1304:vgpr_32, implicit $mode, implicit $exec + %1306:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1307:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1306:vgpr_32, implicit $mode, implicit $exec + %1308:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1307:vgpr_32, implicit $mode, implicit $exec + %1309:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1310:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1309:vgpr_32, implicit $mode, implicit $exec + %1311:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1310:vgpr_32, implicit $mode, implicit $exec + %1312:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1313:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1312:vgpr_32, implicit $mode, implicit $exec + %1314:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1313:vgpr_32, implicit $mode, implicit $exec + %1315:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1316:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1315:vgpr_32, implicit $mode, implicit $exec + %1317:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1316:vgpr_32, implicit $mode, implicit $exec + %1318:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1319:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, 
%1318:vgpr_32, implicit $mode, implicit $exec + %1320:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1319:vgpr_32, implicit $mode, implicit $exec + %1321:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1322:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1321:vgpr_32, implicit $mode, implicit $exec + %1323:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1322:vgpr_32, implicit $mode, implicit $exec + %1324:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1325:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1324:vgpr_32, implicit $mode, implicit $exec + %1326:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1325:vgpr_32, implicit $mode, implicit $exec + %1327:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1328:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1327:vgpr_32, implicit $mode, implicit $exec + %1329:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1328:vgpr_32, implicit $mode, implicit $exec + %1330:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1331:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1330:vgpr_32, implicit $mode, implicit $exec + %1332:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1331:vgpr_32, implicit $mode, implicit $exec + %1333:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1334:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1333:vgpr_32, implicit $mode, implicit $exec + %1335:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1334:vgpr_32, implicit $mode, implicit $exec + %1336:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, 
%701.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1337:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1336:vgpr_32, implicit $mode, implicit $exec + %1338:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1337:vgpr_32, implicit $mode, implicit $exec + %1339:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1340:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1339:vgpr_32, implicit $mode, implicit $exec + %1341:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1340:vgpr_32, implicit $mode, implicit $exec + %1342:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1343:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1342:vgpr_32, implicit $mode, implicit $exec + %1344:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1343:vgpr_32, implicit $mode, implicit $exec + %1345:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1346:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1345:vgpr_32, implicit $mode, implicit $exec + %1347:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1346:vgpr_32, implicit $mode, implicit $exec + %1348:vgpr_32 = contract nofpexcept V_ADD_F32_e32 0, %1158:vgpr_32, implicit $mode, implicit $exec + %1349:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1161:vgpr_32, %1348:vgpr_32, implicit $mode, implicit $exec + %1350:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1164:vgpr_32, %1349:vgpr_32, implicit $mode, implicit $exec + %1351:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1167:vgpr_32, %1350:vgpr_32, implicit $mode, implicit $exec + %1352:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1170:vgpr_32, %1351:vgpr_32, implicit $mode, implicit $exec + %1353:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1173:vgpr_32, %1352:vgpr_32, implicit $mode, 
implicit $exec + %1354:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1176:vgpr_32, %1353:vgpr_32, implicit $mode, implicit $exec + %1355:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1179:vgpr_32, %1354:vgpr_32, implicit $mode, implicit $exec + %1356:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1182:vgpr_32, %1355:vgpr_32, implicit $mode, implicit $exec + %1357:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1185:vgpr_32, %1356:vgpr_32, implicit $mode, implicit $exec + %1358:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1188:vgpr_32, %1357:vgpr_32, implicit $mode, implicit $exec + %1359:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1191:vgpr_32, %1358:vgpr_32, implicit $mode, implicit $exec + %1360:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1194:vgpr_32, %1359:vgpr_32, implicit $mode, implicit $exec + %1361:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1197:vgpr_32, %1360:vgpr_32, implicit $mode, implicit $exec + %1362:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1200:vgpr_32, %1361:vgpr_32, implicit $mode, implicit $exec + %1363:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1203:vgpr_32, %1362:vgpr_32, implicit $mode, implicit $exec + %1364:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1206:vgpr_32, %1363:vgpr_32, implicit $mode, implicit $exec + %1365:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1209:vgpr_32, %1364:vgpr_32, implicit $mode, implicit $exec + %1366:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1212:vgpr_32, %1365:vgpr_32, implicit $mode, implicit $exec + %1367:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1215:vgpr_32, %1366:vgpr_32, implicit $mode, implicit $exec + %1368:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1218:vgpr_32, %1367:vgpr_32, implicit $mode, implicit $exec + %1369:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1221:vgpr_32, %1368:vgpr_32, implicit $mode, implicit $exec + %1370:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1224:vgpr_32, %1369:vgpr_32, implicit $mode, implicit $exec + %1371:vgpr_32 = contract nofpexcept V_ADD_F32_e32 
%1227:vgpr_32, %1370:vgpr_32, implicit $mode, implicit $exec + %1372:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1230:vgpr_32, %1371:vgpr_32, implicit $mode, implicit $exec + %1373:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1233:vgpr_32, %1372:vgpr_32, implicit $mode, implicit $exec + %1374:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1236:vgpr_32, %1373:vgpr_32, implicit $mode, implicit $exec + %1375:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1239:vgpr_32, %1374:vgpr_32, implicit $mode, implicit $exec + %1376:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1242:vgpr_32, %1375:vgpr_32, implicit $mode, implicit $exec + %1377:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1245:vgpr_32, %1376:vgpr_32, implicit $mode, implicit $exec + %1378:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1248:vgpr_32, %1377:vgpr_32, implicit $mode, implicit $exec + %1379:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1251:vgpr_32, %1378:vgpr_32, implicit $mode, implicit $exec + %1380:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1254:vgpr_32, %1379:vgpr_32, implicit $mode, implicit $exec + %1381:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1257:vgpr_32, %1380:vgpr_32, implicit $mode, implicit $exec + %1382:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1260:vgpr_32, %1381:vgpr_32, implicit $mode, implicit $exec + %1383:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1263:vgpr_32, %1382:vgpr_32, implicit $mode, implicit $exec + %1384:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1266:vgpr_32, %1383:vgpr_32, implicit $mode, implicit $exec + %1385:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1269:vgpr_32, %1384:vgpr_32, implicit $mode, implicit $exec + %1386:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1272:vgpr_32, %1385:vgpr_32, implicit $mode, implicit $exec + %1387:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1275:vgpr_32, %1386:vgpr_32, implicit $mode, implicit $exec + %1388:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1278:vgpr_32, %1387:vgpr_32, implicit $mode, implicit $exec + %1389:vgpr_32 = 
contract nofpexcept V_ADD_F32_e32 %1281:vgpr_32, %1388:vgpr_32, implicit $mode, implicit $exec + %1390:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1284:vgpr_32, %1389:vgpr_32, implicit $mode, implicit $exec + %1391:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1287:vgpr_32, %1390:vgpr_32, implicit $mode, implicit $exec + %1392:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1290:vgpr_32, %1391:vgpr_32, implicit $mode, implicit $exec + %1393:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1293:vgpr_32, %1392:vgpr_32, implicit $mode, implicit $exec + %1394:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1296:vgpr_32, %1393:vgpr_32, implicit $mode, implicit $exec + %1395:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1299:vgpr_32, %1394:vgpr_32, implicit $mode, implicit $exec + %1396:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1302:vgpr_32, %1395:vgpr_32, implicit $mode, implicit $exec + %1397:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1305:vgpr_32, %1396:vgpr_32, implicit $mode, implicit $exec + %1398:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1308:vgpr_32, %1397:vgpr_32, implicit $mode, implicit $exec + %1399:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1311:vgpr_32, %1398:vgpr_32, implicit $mode, implicit $exec + %1400:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1314:vgpr_32, %1399:vgpr_32, implicit $mode, implicit $exec + %1401:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1317:vgpr_32, %1400:vgpr_32, implicit $mode, implicit $exec + %1402:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1320:vgpr_32, %1401:vgpr_32, implicit $mode, implicit $exec + %1403:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1323:vgpr_32, %1402:vgpr_32, implicit $mode, implicit $exec + %1404:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1326:vgpr_32, %1403:vgpr_32, implicit $mode, implicit $exec + %1405:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1329:vgpr_32, %1404:vgpr_32, implicit $mode, implicit $exec + %1406:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1332:vgpr_32, %1405:vgpr_32, implicit 
$mode, implicit $exec + %1407:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1335:vgpr_32, %1406:vgpr_32, implicit $mode, implicit $exec + %1408:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1338:vgpr_32, %1407:vgpr_32, implicit $mode, implicit $exec + %1409:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1341:vgpr_32, %1408:vgpr_32, implicit $mode, implicit $exec + %1410:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1344:vgpr_32, %1409:vgpr_32, implicit $mode, implicit $exec + %1411:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1347:vgpr_32, %1410:vgpr_32, implicit $mode, implicit $exec + %1412:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1411:vgpr_32, 0, implicit $exec + %1413:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1411:vgpr_32, %1412:vgpr_32, implicit $mode, implicit $exec + %1414:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1413:vgpr_32, 0, implicit $exec + %3347:vgpr_32 = V_CNDMASK_B32_e64 0, %1414:vgpr_32, 0, %1413:vgpr_32, %484:sreg_64_xexec, implicit $exec + %1417:vgpr_32 = contract nofpexcept V_SUB_F32_e32 %3346:vgpr_32, %151:vgpr_32, implicit $mode, implicit $exec + %1418:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1417:vgpr_32, implicit $mode, implicit $exec + undef %1455.sub0:vreg_64_align2 = afn nofpexcept V_EXP_F32_e32 %1418:vgpr_32, implicit $mode, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + undef %3037.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub0:vreg_64_align2, %803.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec + undef %3021.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub0:vreg_64_align2, %803.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec + %3037.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub0:vreg_64_align2, %807.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec + %3021.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub0:vreg_64_align2, %807.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec + undef %3005.sub0:vreg_64_align2 = V_PERM_B32_e64 
%805.sub1:vreg_64_align2, %803.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec + undef %2978.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub1:vreg_64_align2, %803.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec + %3005.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub1:vreg_64_align2, %807.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec + %2978.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub1:vreg_64_align2, %807.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec + %1442:vgpr_32 = V_ADD_U32_e32 %593:sreg_32, %15:vgpr_32, implicit $exec + %1444:vgpr_32 = V_AND_B32_e32 536870911, %1442:vgpr_32, implicit $exec + %1446:vgpr_32 = nsw V_MUL_LO_U32_e64 %1444:vgpr_32, %494:sreg_32, implicit $exec + %1447:vgpr_32 = V_ADD_LSHL_U32_e64 %47:vgpr_32, %1446:vgpr_32, 1, implicit $exec + DS_WRITE_B64_gfx9 %1447:vgpr_32, %3037:vreg_64_align2, 0, 0, implicit $exec + %1449:vgpr_32 = V_LSHL_ADD_U32_e64 %41:vgpr_32, 1, %1447:vgpr_32, implicit $exec + DS_WRITE_B64_gfx9 %1449:vgpr_32, %3021:vreg_64_align2, 0, 0, implicit $exec + %1451:vgpr_32 = V_LSHL_ADD_U32_e64 %42:vgpr_32, 1, %1449:vgpr_32, implicit $exec + DS_WRITE_B64_gfx9 %1451:vgpr_32, %3005:vreg_64_align2, 0, 0, implicit $exec + %1453:vgpr_32 = V_LSHL_ADD_U32_e64 %43:vgpr_32, 1, %1451:vgpr_32, implicit $exec + DS_WRITE_B64_gfx9 %1453:vgpr_32, %2978:vreg_64_align2, 0, 0, implicit $exec + %3347:vgpr_32 = contract nofpexcept V_FMAC_F32_e32 %86:vgpr_32, %1455.sub0:vreg_64_align2, %3347:vgpr_32, implicit $mode, implicit $exec + %2986.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2986.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2986.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, 
implicit $exec + %2986.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2986.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2986.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2986.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2986.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3038.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3038.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3038.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3038.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3038.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3038.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + 
%3038.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3038.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2980.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2980.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2980.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2980.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2980.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2980.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2980.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2980.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3003.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3003.sub2_sub3:vreg_512_align2 = contract 
nofpexcept V_PK_MUL_F32 8, %3003.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3003.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3003.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3003.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3003.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3003.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3003.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %1554:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1158:vgpr_32, implicit $mode, implicit $exec + %1555:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1161:vgpr_32, implicit $mode, implicit $exec + %1556:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1164:vgpr_32, implicit $mode, implicit $exec + %1557:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1170:vgpr_32, implicit $mode, implicit $exec + %1558:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1173:vgpr_32, implicit $mode, implicit $exec + %1559:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1176:vgpr_32, implicit $mode, implicit $exec + %1560:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1182:vgpr_32, implicit $mode, implicit $exec + %1561:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1185:vgpr_32, implicit $mode, implicit $exec + %1562:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 
%1188:vgpr_32, implicit $mode, implicit $exec + %1563:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1194:vgpr_32, implicit $mode, implicit $exec + %1564:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1197:vgpr_32, implicit $mode, implicit $exec + %1565:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1200:vgpr_32, implicit $mode, implicit $exec + %1566:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1206:vgpr_32, implicit $mode, implicit $exec + %1567:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1209:vgpr_32, implicit $mode, implicit $exec + %1568:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1212:vgpr_32, implicit $mode, implicit $exec + %1569:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1218:vgpr_32, implicit $mode, implicit $exec + %1570:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1221:vgpr_32, implicit $mode, implicit $exec + %1571:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1224:vgpr_32, implicit $mode, implicit $exec + %1572:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1230:vgpr_32, implicit $mode, implicit $exec + %1573:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1233:vgpr_32, implicit $mode, implicit $exec + %1574:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1236:vgpr_32, implicit $mode, implicit $exec + %1575:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1242:vgpr_32, implicit $mode, implicit $exec + %1576:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1245:vgpr_32, implicit $mode, implicit $exec + %1577:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1248:vgpr_32, implicit $mode, implicit $exec + %1578:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1254:vgpr_32, implicit $mode, implicit $exec + %1579:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1257:vgpr_32, implicit $mode, implicit $exec + %1580:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1260:vgpr_32, implicit $mode, implicit $exec + %1581:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1266:vgpr_32, implicit $mode, implicit $exec + %1582:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1269:vgpr_32, implicit $mode, implicit $exec + %1583:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1272:vgpr_32, implicit $mode, implicit $exec 
+ %1584:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1278:vgpr_32, implicit $mode, implicit $exec + %1585:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1281:vgpr_32, implicit $mode, implicit $exec + %1586:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1284:vgpr_32, implicit $mode, implicit $exec + %1587:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1290:vgpr_32, implicit $mode, implicit $exec + %1588:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1293:vgpr_32, implicit $mode, implicit $exec + %1589:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1296:vgpr_32, implicit $mode, implicit $exec + %1590:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3345:vgpr_32, implicit $exec + %1591:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1590:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %1592:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3334:vgpr_32, implicit $exec + %1593:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1592:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %1594:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3335:vgpr_32, implicit $exec + %1595:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1594:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %1596:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3336:vgpr_32, implicit $exec + %1597:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1596:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %1598:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec + %1605:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec + %1612:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec + %1619:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec + %1626:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec + %1633:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec + %1640:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec + %1647:vreg_128_align2 = 
DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + undef %3161.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub0:vreg_64_align2, %1591.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec + undef %3145.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub0:vreg_64_align2, %1591.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec + %3161.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub0:vreg_64_align2, %1595.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec + %3145.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub0:vreg_64_align2, %1595.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec + undef %3129.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub1:vreg_64_align2, %1591.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec + undef %3113.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub1:vreg_64_align2, %1591.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec + %3129.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub1:vreg_64_align2, %1595.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec + %3113.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub1:vreg_64_align2, %1595.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec + DS_WRITE_B64_gfx9 %1447:vgpr_32, %3161:vreg_64_align2, 0, 0, implicit $exec + DS_WRITE_B64_gfx9 %1449:vgpr_32, %3145:vreg_64_align2, 0, 0, implicit $exec + DS_WRITE_B64_gfx9 %1451:vgpr_32, %3129:vreg_64_align2, 0, 0, implicit $exec + DS_WRITE_B64_gfx9 %1453:vgpr_32, %3113:vreg_64_align2, 0, 0, implicit $exec + %1678:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3344:vgpr_32, implicit $exec + %1679:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1678:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %1680:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3337:vgpr_32, implicit $exec + %1681:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1680:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %1682:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3338:vgpr_32, implicit $exec + 
%1683:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1682:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %1684:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3339:vgpr_32, implicit $exec + %1685:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1684:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %1686:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec + %1693:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec + %1700:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec + %1707:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec + %1714:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec + %1721:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec + %1728:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec + %1735:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + undef %3062.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub0:vreg_64_align2, %1679.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec + undef %3046.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub0:vreg_64_align2, %1679.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec + %3062.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub0:vreg_64_align2, %1683.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec + %3046.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub0:vreg_64_align2, %1683.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec + undef %3029.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub1:vreg_64_align2, %1679.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec + undef %3013.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub1:vreg_64_align2, %1679.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec + %3029.sub1:vreg_64_align2 = 
V_PERM_B32_e64 %1685.sub1:vreg_64_align2, %1683.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec + %3013.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub1:vreg_64_align2, %1683.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec + DS_WRITE_B64_gfx9 %1447:vgpr_32, %3062:vreg_64_align2, 0, 0, implicit $exec + DS_WRITE_B64_gfx9 %1449:vgpr_32, %3046:vreg_64_align2, 0, 0, implicit $exec + DS_WRITE_B64_gfx9 %1451:vgpr_32, %3029:vreg_64_align2, 0, 0, implicit $exec + DS_WRITE_B64_gfx9 %1453:vgpr_32, %3013:vreg_64_align2, 0, 0, implicit $exec + %1766:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3343:vgpr_32, implicit $exec + %1767:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1766:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %1768:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3340:vgpr_32, implicit $exec + %1769:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1768:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %1770:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3341:vgpr_32, implicit $exec + %1771:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1770:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %1772:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3342:vgpr_32, implicit $exec + %1773:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1772:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %1774:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec + %1781:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec + %1788:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec + %1795:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec + %1802:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec + %1809:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec + %1816:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec + %1823:vreg_128_align2 = DS_READ_B128_gfx9 
%45:vgpr_32, 1728, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + undef %3185.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub0:vreg_64_align2, %1767.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec + undef %3169.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub0:vreg_64_align2, %1767.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec + %3185.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub0:vreg_64_align2, %1771.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec + %3169.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub0:vreg_64_align2, %1771.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec + undef %3153.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub1:vreg_64_align2, %1767.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec + undef %3137.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub1:vreg_64_align2, %1767.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec + %3153.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub1:vreg_64_align2, %1771.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec + %3137.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub1:vreg_64_align2, %1771.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec + DS_WRITE_B64_gfx9 %1447:vgpr_32, %3185:vreg_64_align2, 0, 0, implicit $exec + DS_WRITE_B64_gfx9 %1449:vgpr_32, %3169:vreg_64_align2, 0, 0, implicit $exec + DS_WRITE_B64_gfx9 %1451:vgpr_32, %3153:vreg_64_align2, 0, 0, implicit $exec + DS_WRITE_B64_gfx9 %1453:vgpr_32, %3137:vreg_64_align2, 0, 0, implicit $exec + %1854:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1167:vgpr_32, implicit $mode, implicit $exec + %1855:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1179:vgpr_32, implicit $mode, implicit $exec + %1856:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1191:vgpr_32, implicit $mode, implicit $exec + %1857:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1203:vgpr_32, implicit $mode, implicit $exec + %1858:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1215:vgpr_32, implicit $mode, implicit $exec + 
%1859:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1227:vgpr_32, implicit $mode, implicit $exec + %1860:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1239:vgpr_32, implicit $mode, implicit $exec + %1861:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1251:vgpr_32, implicit $mode, implicit $exec + %1862:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1263:vgpr_32, implicit $mode, implicit $exec + %1863:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1275:vgpr_32, implicit $mode, implicit $exec + %1864:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1287:vgpr_32, implicit $mode, implicit $exec + %1865:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1299:vgpr_32, implicit $mode, implicit $exec + undef %3121.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1556:vgpr_32, 0, %1854:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3121.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1554:vgpr_32, 0, %1555:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3105.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1559:vgpr_32, 0, %1855:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3105.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1557:vgpr_32, 0, %1558:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3089.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1562:vgpr_32, 0, %1856:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3089.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1560:vgpr_32, 0, %1561:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3073.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1565:vgpr_32, 0, %1857:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3073.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1563:vgpr_32, 0, %1564:vgpr_32, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1598.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 
%1598.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1605.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1605.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1612.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1612.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1619.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1619.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1626.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1626.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1633.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1633.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit 
$mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1640.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1640.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1647.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1647.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + undef %2993.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1568:vgpr_32, 0, %1858:vgpr_32, 0, 0, implicit $mode, implicit $exec + %2993.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1566:vgpr_32, 0, %1567:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3195.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1571:vgpr_32, 0, %1859:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3195.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1569:vgpr_32, 0, %1570:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3178.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1574:vgpr_32, 0, %1860:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3178.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1572:vgpr_32, 0, %1573:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3162.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1577:vgpr_32, 0, %1861:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3162.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1575:vgpr_32, 0, %1576:vgpr_32, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1686.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, 
%2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1686.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1693.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1693.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1700.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1700.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1707.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1707.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1714.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1714.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1721.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract 
V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1721.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1728.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1728.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1735.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1735.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + undef %3146.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1580:vgpr_32, 0, %1862:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3146.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1578:vgpr_32, 0, %1579:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3130.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1583:vgpr_32, 0, %1863:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3130.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1581:vgpr_32, 0, %1582:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3114.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1586:vgpr_32, 0, %1864:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3114.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1584:vgpr_32, 0, %1585:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3098.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1589:vgpr_32, 0, %1865:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3098.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1587:vgpr_32, 0, %1588:vgpr_32, 0, 0, implicit $mode, implicit 
$exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1774.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1774.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1781.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1781.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1788.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1788.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1795.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1795.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1802.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1802.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 
%1809.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1809.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1816.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1816.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1823.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1823.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2054:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1347:vgpr_32, implicit $mode, implicit $exec + %2055:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1341:vgpr_32, implicit $mode, implicit $exec + %2056:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1335:vgpr_32, implicit $mode, implicit $exec + %2057:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1329:vgpr_32, implicit $mode, implicit $exec + %2058:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1323:vgpr_32, implicit $mode, implicit $exec + %2059:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1317:vgpr_32, implicit $mode, implicit $exec + %2060:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1311:vgpr_32, implicit $mode, implicit $exec + %2061:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1305:vgpr_32, implicit $mode, implicit $exec + %2062:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1344:vgpr_32, implicit $mode, implicit $exec + %2063:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1338:vgpr_32, implicit $mode, implicit $exec + 
%2064:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1332:vgpr_32, implicit $mode, implicit $exec + %2065:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1326:vgpr_32, implicit $mode, implicit $exec + %2066:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1320:vgpr_32, implicit $mode, implicit $exec + %2067:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1314:vgpr_32, implicit $mode, implicit $exec + %2068:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1308:vgpr_32, implicit $mode, implicit $exec + %2069:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1302:vgpr_32, implicit $mode, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + undef %3082.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2068:vgpr_32, 0, %2060:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3082.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2069:vgpr_32, 0, %2061:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3066.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2066:vgpr_32, 0, %2058:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3066.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2067:vgpr_32, 0, %2059:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3050.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2064:vgpr_32, 0, %2056:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3050.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2065:vgpr_32, 0, %2057:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3033.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2062:vgpr_32, 0, %2054:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3033.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2063:vgpr_32, 0, %2055:vgpr_32, 0, 0, implicit $mode, implicit $exec + %2082:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2082.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, 
implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2082.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2095:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2095.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2095.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2108:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2108.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2108.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2121:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2121.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2121.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2134:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2134.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2134.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, 
%2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2146:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2146.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2146.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2158:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2158.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2158.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2170:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2170.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2170.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %3345:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3345:vgpr_32, implicit $exec + %3344:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3344:vgpr_32, implicit $exec + %3343:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3343:vgpr_32, implicit $exec + %3342:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3342:vgpr_32, implicit $exec + %3341:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3341:vgpr_32, implicit $exec + %3340:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3340:vgpr_32, 
implicit $exec + %3339:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3339:vgpr_32, implicit $exec + %3338:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3338:vgpr_32, implicit $exec + %3337:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3337:vgpr_32, implicit $exec + %3336:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3336:vgpr_32, implicit $exec + %3335:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3335:vgpr_32, implicit $exec + %3334:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3334:vgpr_32, implicit $exec + %3333:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3333:vgpr_32, implicit $exec + %3332:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3332:vgpr_32, implicit $exec + %3331:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3331:vgpr_32, implicit $exec + %3330:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3330:vgpr_32, implicit $exec + %3329:vgpr_32 = nuw V_ADD_U32_e32 128, %3329:vgpr_32, implicit $exec + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir new file mode 100644 index 0000000000000..3e467af66590a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir @@ -0,0 +1,900 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s + +--- | + define amdgpu_kernel void @smallInterleave() #0 { ret void } + ; GCN-LABEL: smallInterleave: + ; GCN: ; %bb.0: + ; GCN-NEXT: ; implicit-def: $vgpr2 + ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + ; GCN-NEXT: v_readfirstlane_b32 s20, v2 + ; GCN-NEXT: ; implicit-def: $sgpr4 + ; GCN-NEXT: ; implicit-def: $vgpr3 + ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: ; implicit-def: $vgpr54 + ; GCN-NEXT: ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19 + ; GCN-NEXT: ; implicit-def: $vgpr37 + ; GCN-NEXT: ; implicit-def: 
$vgpr28_vgpr29_vgpr30_vgpr31 + ; GCN-NEXT: ; implicit-def: $vgpr55 + ; GCN-NEXT: ; implicit-def: $vgpr88 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19 + ; GCN-NEXT: ; iglp_opt mask(0x00000002) + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_lshl_add_u32 v2, s20, 4, v3 + ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1] + ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: s_lshl_b32 s4, s20, 7 + ; GCN-NEXT: ; implicit-def: $vgpr5 + ; GCN-NEXT: v_add_lshl_u32 v36, v5, s4, 1 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b128 v36, v[0:3] + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_load_dwordx4 v[16:19], v4, s[0:3], 0 offen offset:64 sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ; implicit-def: $vgpr0 + ; GCN-NEXT: ; implicit-def: $vgpr1 + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: ; implicit-def: $sgpr6 + ; GCN-NEXT: v_add_u32_e32 v0, v0, v54 + ; GCN-NEXT: v_add_u32_e32 v1, v1, v54 + ; GCN-NEXT: buffer_load_dwordx2 v[24:25], v0, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[26:27], v1, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: ds_read_b128 v[20:23], v37 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[32:35], v37 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ; kill: killed $vgpr1 + ; GCN-NEXT: ; kill: killed $vgpr0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[38:53], v[20:21], v[28:29], 0 + ; GCN-NEXT: ; 
implicit-def: $sgpr2 + ; GCN-NEXT: ; implicit-def: $sgpr3 + ; GCN-NEXT: ; implicit-def: $sgpr5 + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[32:33], v[28:29], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[38:53], v[22:23], v[30:31], v[38:53] + ; GCN-NEXT: ds_read_b128 v[20:23], v55 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[34:35], v[30:31], v[0:15] + ; GCN-NEXT: ds_read_b128 v[32:35], v55 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b128 v36, v[16:19] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[38:53], v[20:21], v[28:29], v[38:53] + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[16:19], v37 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ; implicit-def: $vgpr36 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[32:33], v[28:29], v[0:15] + ; GCN-NEXT: ; implicit-def: $vgpr32 + ; GCN-NEXT: ; implicit-def: $vgpr33 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[38:53], v[22:23], v[30:31], v[38:53] + ; GCN-NEXT: ; implicit-def: $vgpr20_vgpr21_vgpr22_vgpr23 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[34:35], v[30:31], v[0:15] + ; GCN-NEXT: ds_read_b128 v[28:31], v37 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ; implicit-def: $vgpr34 + ; GCN-NEXT: ; implicit-def: $vgpr35 + ; GCN-NEXT: ; implicit-def: $vgpr37 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[28:29], v[20:21], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[38:53], v[16:17], v[20:21], v[38:53] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[30:31], v[22:23], v[0:15] + ; GCN-NEXT: ds_read_b128 v[28:31], 
v55 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[38:53], v[18:19], v[22:23], v[38:53] + ; GCN-NEXT: ds_read_b128 v[20:23], v55 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: ; implicit-def: $vgpr55 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[38:53], v[28:29], v[16:17], v[38:53] + ; GCN-NEXT: ; implicit-def: $vgpr29 + ; GCN-NEXT: ; implicit-def: $vgpr28 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[20:21], v[16:17], v[0:15] + ; GCN-NEXT: v_add_u32_e32 v21, s20, v29 + ; GCN-NEXT: v_and_b32_e32 v21, 0x1fffffff, v21 + ; GCN-NEXT: v_mul_lo_u32 v21, v21, s6 + ; GCN-NEXT: v_add_lshl_u32 v68, v32, v21, 1 + ; GCN-NEXT: v_lshl_add_u32 v73, v33, 1, v68 + ; GCN-NEXT: v_lshl_add_u32 v74, v34, 1, v73 + ; GCN-NEXT: v_lshl_add_u32 v75, v35, 1, v74 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[38:53], v[30:31], v[18:19], v[38:53] + ; GCN-NEXT: v_perm_b32 v21, v26, v24, s2 + ; GCN-NEXT: v_perm_b32 v24, v26, v24, s3 + ; GCN-NEXT: v_perm_b32 v26, v27, v25, s2 + ; GCN-NEXT: v_perm_b32 v25, v27, v25, s3 + ; GCN-NEXT: v_add_u32_e32 v17, v36, v54 + ; GCN-NEXT: v_add_u32_e32 v20, v37, v54 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b32 v68, v21 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b32 v73, v24 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b32 v74, v26 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b32 v75, v25 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_load_dwordx2 v[56:57], v17, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[58:59], v20, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; 
GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[22:23], v[18:19], v[0:15] + ; GCN-NEXT: v_mul_f32_e32 v18, s4, v38 + ; GCN-NEXT: v_mul_f32_e32 v19, s4, v39 + ; GCN-NEXT: v_mul_f32_e32 v22, s4, v40 + ; GCN-NEXT: v_mul_f32_e32 v23, s4, v41 + ; GCN-NEXT: v_max3_f32 v18, v18, s5, v19 + ; GCN-NEXT: v_mul_f32_e32 v27, s4, v42 + ; GCN-NEXT: v_mul_f32_e32 v29, s4, v43 + ; GCN-NEXT: v_max3_f32 v18, v18, v22, v23 + ; GCN-NEXT: v_mul_f32_e32 v30, s4, v44 + ; GCN-NEXT: v_mul_f32_e32 v31, s4, v45 + ; GCN-NEXT: v_max3_f32 v18, v18, v27, v29 + ; GCN-NEXT: v_mul_f32_e32 v32, s4, v46 + ; GCN-NEXT: v_mul_f32_e32 v33, s4, v47 + ; GCN-NEXT: v_max3_f32 v18, v18, v30, v31 + ; GCN-NEXT: v_mul_f32_e32 v34, s4, v48 + ; GCN-NEXT: v_mul_f32_e32 v35, s4, v49 + ; GCN-NEXT: v_max3_f32 v18, v18, v32, v33 + ; GCN-NEXT: v_mul_f32_e32 v36, s4, v50 + ; GCN-NEXT: v_mul_f32_e32 v19, s4, v51 + ; GCN-NEXT: v_max3_f32 v18, v18, v34, v35 + ; GCN-NEXT: v_mul_f32_e32 v22, s4, v52 + ; GCN-NEXT: v_mul_f32_e32 v23, s4, v53 + ; GCN-NEXT: v_max3_f32 v18, v18, v36, v19 + ; GCN-NEXT: v_mul_f32_e32 v27, s4, v0 + ; GCN-NEXT: v_mul_f32_e32 v29, s4, v1 + ; GCN-NEXT: v_max3_f32 v18, v18, v22, v23 + ; GCN-NEXT: v_mul_f32_e32 v30, s4, v2 + ; GCN-NEXT: v_mul_f32_e32 v31, s4, v3 + ; GCN-NEXT: v_max3_f32 v18, v18, v27, v29 + ; GCN-NEXT: v_mul_f32_e32 v32, s4, v4 + ; GCN-NEXT: v_mul_f32_e32 v33, s4, v5 + ; GCN-NEXT: v_max3_f32 v18, v18, v30, v31 + ; GCN-NEXT: v_mul_f32_e32 v34, s4, v6 + ; GCN-NEXT: v_mul_f32_e32 v35, s4, v7 + ; GCN-NEXT: v_max3_f32 v18, v18, v32, v33 + ; GCN-NEXT: v_mul_f32_e32 v19, s4, v8 + ; GCN-NEXT: v_mul_f32_e32 v36, s4, v9 + ; GCN-NEXT: v_max3_f32 v18, v18, v34, v35 + ; GCN-NEXT: v_mul_f32_e32 v22, s4, v10 + ; GCN-NEXT: v_mul_f32_e32 v23, s4, v11 + ; GCN-NEXT: v_max3_f32 v18, v18, v19, v36 + ; GCN-NEXT: v_mul_f32_e32 v27, s4, v12 + ; GCN-NEXT: v_mul_f32_e32 v29, s4, v13 + ; GCN-NEXT: v_max3_f32 v18, v18, v22, v23 + ; GCN-NEXT: v_mul_f32_e32 v30, s4, v14 + ; GCN-NEXT: 
v_mul_f32_e32 v31, s4, v15 + ; GCN-NEXT: v_max3_f32 v18, v18, v27, v29 + ; GCN-NEXT: v_max3_f32 v18, v18, v30, v31 + ; GCN-NEXT: ds_bpermute_b32 v19, v55, v18 + ; GCN-NEXT: ; kill: killed $vgpr17 + ; GCN-NEXT: v_max_f32_e32 v16, v28, v28 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[78:81], v88 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ; kill: killed $vgpr20 + ; GCN-NEXT: ds_read_b128 v[82:85], v88 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_max_f32_e32 v19, v19, v19 + ; GCN-NEXT: v_max_f32_e32 v18, v18, v19 + ; GCN-NEXT: ds_bpermute_b32 v19, v55, v18 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v17, v19, v18, s[0:1] + ; GCN-NEXT: v_max_f32_e32 v17, v17, v17 + ; GCN-NEXT: v_max_f32_e32 v76, v16, v17 + ; GCN-NEXT: v_fma_f32 v16, s4, v38, -v76 + ; GCN-NEXT: v_fma_f32 v17, s4, v39, -v76 + ; GCN-NEXT: v_fma_f32 v18, s4, v40, -v76 + ; GCN-NEXT: v_fma_f32 v19, s4, v41, -v76 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v16 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17 + ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v18 + ; GCN-NEXT: v_mul_f32_e32 v19, 0x3fb8aa3b, v19 + ; GCN-NEXT: v_exp_f32_e32 v60, v16 + ; GCN-NEXT: v_exp_f32_e32 v61, v17 + ; GCN-NEXT: v_exp_f32_e32 v62, v18 + ; GCN-NEXT: v_exp_f32_e32 v63, v19 + ; GCN-NEXT: v_fma_f32 v20, s4, v42, -v76 + ; GCN-NEXT: v_sub_f32_e32 v26, v28, v76 + ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v20 + ; GCN-NEXT: v_fma_f32 v22, s4, v44, -v76 + ; GCN-NEXT: v_fma_f32 v23, s4, v45, -v76 + ; GCN-NEXT: v_exp_f32_e32 v64, v20 + ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v26 + ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22 + ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 + ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v60 + ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v61 + ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v62 + ; GCN-NEXT: v_cvt_f16_f32_e32 
v19, v63 + ; GCN-NEXT: v_exp_f32_e32 v54, v20 + ; GCN-NEXT: v_fma_f32 v21, s4, v43, -v76 + ; GCN-NEXT: v_exp_f32_e32 v66, v22 + ; GCN-NEXT: v_exp_f32_e32 v67, v23 + ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v21 + ; GCN-NEXT: v_fma_f32 v24, s4, v46, -v76 + ; GCN-NEXT: v_fma_f32 v25, s4, v47, -v76 + ; GCN-NEXT: v_exp_f32_e32 v65, v21 + ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v76 + ; GCN-NEXT: v_pack_b32_f16 v87, v18, v19 + ; GCN-NEXT: v_pack_b32_f16 v86, v16, v17 + ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[54:55] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[54:55] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[54:55] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[54:55] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[54:55] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[54:55] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[54:55] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[54:55] op_sel_hi:[1,0] + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 + ; GCN-NEXT: v_exp_f32_e32 v71, v48 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[78:79], v[86:87], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v78, v66 + ; GCN-NEXT: v_cvt_f16_f32_e32 v48, v67 + ; GCN-NEXT: v_cvt_f16_f32_e32 v72, v64 + ; GCN-NEXT: v_cvt_f16_f32_e32 v77, v65 + ; GCN-NEXT: v_mul_f32_e32 v69, 0x3fb8aa3b, v24 + ; GCN-NEXT: v_mul_f32_e32 v70, 0x3fb8aa3b, v25 + ; GCN-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; GCN-NEXT: v_pack_b32_f16 v79, v78, v48 + ; GCN-NEXT: v_fma_f32 v48, s4, v49, -v76 + ; GCN-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[54:55] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[54:55] op_sel_hi:[1,0] + ; GCN-NEXT: 
v_pk_mul_f32 v[20:21], v[20:21], v[54:55] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[54:55] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[54:55] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[54:55] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[54:55] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[54:55] op_sel_hi:[1,0] + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 + ; GCN-NEXT: v_exp_f32_e32 v69, v69 + ; GCN-NEXT: v_exp_f32_e32 v70, v70 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[86:87], v[16:31] + ; GCN-NEXT: v_pack_b32_f16 v78, v72, v77 + ; GCN-NEXT: v_exp_f32_e32 v72, v48 + ; GCN-NEXT: v_fma_f32 v48, s4, v50, -v76 + ; GCN-NEXT: v_fma_f32 v49, s4, v51, -v76 + ; GCN-NEXT: v_fma_f32 v52, s4, v52, -v76 + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 + ; GCN-NEXT: v_mul_f32_e32 v49, 0x3fb8aa3b, v49 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[78:79], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v77, v69 + ; GCN-NEXT: v_exp_f32_e32 v51, v48 + ; GCN-NEXT: v_cvt_f16_f32_e32 v80, v70 + ; GCN-NEXT: v_exp_f32_e32 v50, v49 + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v52 + ; GCN-NEXT: v_cvt_f16_f32_e32 v49, v71 + ; GCN-NEXT: v_cvt_f16_f32_e32 v52, v72 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[78:79], v[16:31] + ; GCN-NEXT: v_pack_b32_f16 v86, v77, v80 + ; GCN-NEXT: v_fma_f32 v53, s4, v53, -v76 + ; GCN-NEXT: v_pack_b32_f16 v87, v49, v52 + ; GCN-NEXT: ; implicit-def: $vgpr52 + ; GCN-NEXT: ds_read_b128 v[78:81], v52 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[82:85], v52 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v49, 0x3fb8aa3b, v53 + ; GCN-NEXT: v_exp_f32_e32 v48, v48 + ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v76 + ; GCN-NEXT: v_exp_f32_e32 v49, v49 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 + ; GCN-NEXT: v_exp_f32_e32 
v77, v1 + ; GCN-NEXT: v_fma_f32 v1, s4, v2, -v76 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[78:79], v[86:87], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 + ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v76 + ; GCN-NEXT: v_cvt_f16_f32_e32 v2, v48 + ; GCN-NEXT: v_exp_f32_e32 v78, v1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v49 + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 + ; GCN-NEXT: v_cvt_f16_f32_e32 v79, v51 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[86:87], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v53, v0 + ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v50 + ; GCN-NEXT: v_pack_b32_f16 v1, v2, v1 + ; GCN-NEXT: v_fma_f32 v2, s4, v3, -v76 + ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 + ; GCN-NEXT: v_pack_b32_f16 v0, v79, v0 + ; GCN-NEXT: v_exp_f32_e32 v79, v2 + ; GCN-NEXT: v_fma_f32 v2, s4, v4, -v76 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[0:1], v[32:47] + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 + ; GCN-NEXT: v_fma_f32 v3, s4, v5, -v76 + ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3 + ; GCN-NEXT: v_cvt_f16_f32_e32 v80, v53 + ; GCN-NEXT: v_exp_f32_e32 v4, v2 + ; GCN-NEXT: v_cvt_f16_f32_e32 v2, v77 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[0:1], v[16:31] + ; GCN-NEXT: v_perm_b32 v0, v58, v56, s2 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b32 v68, v0 + ; GCN-NEXT: v_fma_f32 v0, s4, v6, -v76 + ; GCN-NEXT: v_perm_b32 v1, v58, v56, s3 + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b32 v73, v1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v78 + ; GCN-NEXT: v_exp_f32_e32 v58, v0 + ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v79 + ; GCN-NEXT: v_exp_f32_e32 v5, v3 + ; GCN-NEXT: v_perm_b32 v3, v59, v57, s2 + ; GCN-NEXT: v_perm_b32 v56, v59, v57, s3 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b32 v74, v3 + ; GCN-NEXT: 
buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b32 v75, v56 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_pack_b32_f16 v57, v1, v0 + ; GCN-NEXT: v_pack_b32_f16 v56, v80, v2 + ; GCN-NEXT: v_fma_f32 v0, s4, v7, -v76 + ; GCN-NEXT: v_fma_f32 v1, s4, v8, -v76 + ; GCN-NEXT: v_fma_f32 v2, s4, v9, -v76 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[6:9], v88 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 + ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 + ; GCN-NEXT: v_exp_f32_e32 v59, v0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[56:57], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v7, v1 + ; GCN-NEXT: v_exp_f32_e32 v73, v2 + ; GCN-NEXT: ds_read_b128 v[0:3], v88 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v4 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v5 + ; GCN-NEXT: ; implicit-def: $sgpr2 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[0:1], v[56:57], v[16:31] + ; GCN-NEXT: v_fma_f32 v0, s4, v10, -v76 + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v58 + ; GCN-NEXT: v_exp_f32_e32 v10, v0 + ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v59 + ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0 + ; GCN-NEXT: v_pack_b32_f16 v0, v6, v68 + ; GCN-NEXT: v_fma_f32 v6, s4, v11, -v76 + ; GCN-NEXT: v_fma_f32 v11, s4, v14, -v76 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[8:9], v[0:1], v[32:47] + ; GCN-NEXT: v_fma_f32 v8, s4, v12, -v76 + ; GCN-NEXT: v_fma_f32 v12, s4, v15, -v76 + ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v6 + ; GCN-NEXT: v_exp_f32_e32 v6, v6 + ; GCN-NEXT: v_fma_f32 v9, s4, v13, -v76 + ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v8 + ; GCN-NEXT: v_exp_f32_e32 v14, v8 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[2:3], v[0:1], v[16:31] + ; GCN-NEXT: v_add_f32_e32 v0, 0, v60 + 
; GCN-NEXT: v_add_f32_e32 v0, v61, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v62, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v63, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v64, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v65, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v66, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v67, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v69, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v70, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v71, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v72, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v51, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v50, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v48, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v49, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v53, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v77, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v78, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v79, v0 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v11 + ; GCN-NEXT: v_add_f32_e32 v0, v4, v0 + ; GCN-NEXT: v_exp_f32_e32 v11, v1 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v12 + ; GCN-NEXT: v_add_f32_e32 v0, v5, v0 + ; GCN-NEXT: v_exp_f32_e32 v12, v1 + ; GCN-NEXT: v_add_f32_e32 v4, v58, v0 + ; GCN-NEXT: ds_read_b128 v[0:3], v52 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v9 + ; GCN-NEXT: v_cvt_f16_f32_e32 v13, v7 + ; GCN-NEXT: v_cvt_f16_f32_e32 v15, v73 + ; GCN-NEXT: v_exp_f32_e32 v56, v8 + ; GCN-NEXT: v_cvt_f16_f32_e32 v8, v10 + ; GCN-NEXT: v_cvt_f16_f32_e32 v9, v6 + ; GCN-NEXT: v_add_f32_e32 v4, v59, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v7, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v73, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v10, v4 + ; GCN-NEXT: v_pack_b32_f16 v9, v8, v9 + ; GCN-NEXT: v_pack_b32_f16 v8, v13, v15 + ; GCN-NEXT: v_add_f32_e32 v4, v6, v4 + ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v56 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[0:1], v[8:9], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v11 + ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v14 + ; GCN-NEXT: v_add_f32_e32 v4, v14, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v56, v4 + ; GCN-NEXT: 
v_add_f32_e32 v10, v11, v4 + ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0 + ; GCN-NEXT: v_pack_b32_f16 v0, v6, v5 + ; GCN-NEXT: ds_read_b128 v[4:7], v52 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[4:5], v[8:9], v[16:31] + ; GCN-NEXT: v_mov_b32_e32 v4, 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[2:3], v[0:1], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v2, v12, v10 + ; GCN-NEXT: ds_bpermute_b32 v3, v55, v2 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_add_f32_e32 v2, v2, v3 + ; GCN-NEXT: ds_bpermute_b32 v3, v55, v2 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[6:7], v[0:1], v[16:31] + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] + ; GCN-NEXT: v_fmac_f32_e32 v2, v4, v54 + ; GCN-NEXT: s_endpgm + attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} + + !0 = !{i64 2862105} + +... + +--- +name: smallInterleave +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2, $sgpr3, $sgpr4 + %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1:vgpr_32 = COPY %0:vgpr_32 + %2:vgpr_32 = IMPLICIT_DEF + %3:sreg_32 = IMPLICIT_DEF + %4:vreg_64_align2 = IMPLICIT_DEF + %5:sgpr_128 = IMPLICIT_DEF + %6:vgpr_32 = IMPLICIT_DEF + %7:vgpr_32 = IMPLICIT_DEF + %8:sgpr_128 = IMPLICIT_DEF + %9:vgpr_32 = IMPLICIT_DEF + %10:sgpr_512 = IMPLICIT_DEF + %11:sgpr_32 = IMPLICIT_DEF + %12:sreg_64_xexec = IMPLICIT_DEF + %13:vgpr_32 = IMPLICIT_DEF + %14:sreg_32 = IMPLICIT_DEF + %15:sreg_32 = IMPLICIT_DEF + %16:vgpr_32 = IMPLICIT_DEF + %17:sreg_32 = IMPLICIT_DEF + %18:vgpr_32 = IMPLICIT_DEF + %19:vgpr_32 = IMPLICIT_DEF + %20:vgpr_32 = IMPLICIT_DEF + %21:vgpr_32 = IMPLICIT_DEF + %22:vgpr_32 = IMPLICIT_DEF + %23:vgpr_32 = IMPLICIT_DEF + %24:vgpr_32 = IMPLICIT_DEF + %25:vgpr_32 = IMPLICIT_DEF + %26:sreg_32 = IMPLICIT_DEF + %42:vgpr_32 = IMPLICIT_DEF + %44:vreg_128_align2 = IMPLICIT_DEF + %48:vgpr_32 = 
IMPLICIT_DEF + %49:vreg_128_align2 = IMPLICIT_DEF + %52:vreg_128_align2 = IMPLICIT_DEF + %55:vreg_128_align2 = IMPLICIT_DEF + %106:vgpr_32 = IMPLICIT_DEF + %29:vgpr_32 = IMPLICIT_DEF + %37:vgpr_32 = IMPLICIT_DEF + %259:vreg_512_align2 = IMPLICIT_DEF + %260:vreg_512_align2 = IMPLICIT_DEF + IGLP_OPT 2 + %27:sreg_32 = V_READFIRSTLANE_B32 %2:vgpr_32, implicit $exec + %28:vgpr_32 = V_LSHL_ADD_U32_e64 %27:sreg_32, 4, %29:vgpr_32, implicit $exec + %30:vreg_64_align2, dead %31:sreg_64 = V_MAD_U64_U32_e64 %3:sreg_32, %28:vgpr_32, %4:vreg_64_align2, 0, implicit $exec + %32:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %30.sub0:vreg_64_align2, %5:sgpr_128, 0, 0, 0, 0, implicit $exec + %33:sreg_32 = S_LSHL_B32 %27:sreg_32, 7, implicit-def dead $scc + %34:vgpr_32 = V_ADD_LSHL_U32_e64 %6:vgpr_32, %33:sreg_32, 1, implicit $exec + DS_WRITE_B128_gfx9 %34:vgpr_32, %32:vreg_128_align2, 0, 0, implicit $exec + %35:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %30.sub0:vreg_64_align2, %5:sgpr_128, 0, 64, 0, 0, implicit $exec + %36:vgpr_32 = V_ADD_U32_e32 %7:vgpr_32, %37:vgpr_32, implicit $exec + %38:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %36:vgpr_32, %8:sgpr_128, 0, 0, 0, 0, implicit $exec + %39:vgpr_32 = V_ADD_U32_e32 %9:vgpr_32, %37:vgpr_32, implicit $exec + %40:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %39:vgpr_32, %8:sgpr_128, 0, 0, 0, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %41:vreg_128_align2 = DS_READ_B128_gfx9 %42:vgpr_32, 0, 0, implicit $exec + early-clobber %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %41.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec + %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %41.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %45:vreg_128_align2 = DS_READ_B128_gfx9 %42:vgpr_32, 512, 0, 
implicit $exec + early-clobber %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %45.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec + %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %45.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %47:vreg_128_align2 = DS_READ_B128_gfx9 %48:vgpr_32, 0, 0, implicit $exec + %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %47.sub0_sub1:vreg_128_align2, %49.sub0_sub1:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %47.sub2_sub3:vreg_128_align2, %49.sub2_sub3:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %50:vreg_128_align2 = DS_READ_B128_gfx9 %48:vgpr_32, 512, 0, implicit $exec + %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %50.sub0_sub1:vreg_128_align2, %49.sub0_sub1:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %50.sub2_sub3:vreg_128_align2, %49.sub2_sub3:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + DS_WRITE_B128_gfx9 %34:vgpr_32, %35:vreg_128_align2, 0, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %51:vreg_128_align2 = DS_READ_B128_gfx9 %42:vgpr_32, 0, 0, implicit $exec + %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %51.sub0_sub1:vreg_128_align2, %52.sub0_sub1:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 
%51.sub2_sub3:vreg_128_align2, %52.sub2_sub3:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %53:vreg_128_align2 = DS_READ_B128_gfx9 %42:vgpr_32, 512, 0, implicit $exec + %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %53.sub0_sub1:vreg_128_align2, %52.sub0_sub1:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %53.sub2_sub3:vreg_128_align2, %52.sub2_sub3:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %54:vreg_128_align2 = DS_READ_B128_gfx9 %48:vgpr_32, 0, 0, implicit $exec + %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %54.sub0_sub1:vreg_128_align2, %55.sub0_sub1:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %54.sub2_sub3:vreg_128_align2, %55.sub2_sub3:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %56:vreg_128_align2 = DS_READ_B128_gfx9 %48:vgpr_32, 512, 0, implicit $exec + %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %56.sub0_sub1:vreg_128_align2, %55.sub0_sub1:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %56.sub2_sub3:vreg_128_align2, %55.sub2_sub3:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %57:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub0:vreg_512_align2, implicit $mode, implicit $exec + %58:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub1:vreg_512_align2, implicit $mode, implicit $exec + %59:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub2:vreg_512_align2, implicit $mode, implicit $exec + %60:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub3:vreg_512_align2, implicit 
$mode, implicit $exec + %61:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub4:vreg_512_align2, implicit $mode, implicit $exec + %62:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub5:vreg_512_align2, implicit $mode, implicit $exec + %63:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub6:vreg_512_align2, implicit $mode, implicit $exec + %64:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub7:vreg_512_align2, implicit $mode, implicit $exec + %65:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub8:vreg_512_align2, implicit $mode, implicit $exec + %66:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub9:vreg_512_align2, implicit $mode, implicit $exec + %67:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub10:vreg_512_align2, implicit $mode, implicit $exec + %68:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub11:vreg_512_align2, implicit $mode, implicit $exec + %69:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub12:vreg_512_align2, implicit $mode, implicit $exec + %70:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub13:vreg_512_align2, implicit $mode, implicit $exec + %71:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub14:vreg_512_align2, implicit $mode, implicit $exec + %72:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub15:vreg_512_align2, implicit $mode, implicit $exec + %73:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub0:vreg_512_align2, implicit $mode, implicit $exec + %74:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub1:vreg_512_align2, implicit $mode, implicit $exec + %75:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub2:vreg_512_align2, implicit $mode, implicit $exec + %76:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, 
%46.sub3:vreg_512_align2, implicit $mode, implicit $exec + %77:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub4:vreg_512_align2, implicit $mode, implicit $exec + %78:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub5:vreg_512_align2, implicit $mode, implicit $exec + %79:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub6:vreg_512_align2, implicit $mode, implicit $exec + %80:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub7:vreg_512_align2, implicit $mode, implicit $exec + %81:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub8:vreg_512_align2, implicit $mode, implicit $exec + %82:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub9:vreg_512_align2, implicit $mode, implicit $exec + %83:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub10:vreg_512_align2, implicit $mode, implicit $exec + %84:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub11:vreg_512_align2, implicit $mode, implicit $exec + %85:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub12:vreg_512_align2, implicit $mode, implicit $exec + %86:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub13:vreg_512_align2, implicit $mode, implicit $exec + %87:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub14:vreg_512_align2, implicit $mode, implicit $exec + %88:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub15:vreg_512_align2, implicit $mode, implicit $exec + %89:vgpr_32 = V_MAX3_F32_e64 0, %57:vgpr_32, 0, %11:sgpr_32, 0, %58:vgpr_32, 0, 0, implicit $mode, implicit $exec + %90:vgpr_32 = V_MAX3_F32_e64 0, %89:vgpr_32, 0, %59:vgpr_32, 0, %60:vgpr_32, 0, 0, implicit $mode, implicit $exec + %91:vgpr_32 = V_MAX3_F32_e64 0, %90:vgpr_32, 0, %61:vgpr_32, 0, %62:vgpr_32, 0, 0, implicit $mode, implicit $exec + %92:vgpr_32 = V_MAX3_F32_e64 0, %91:vgpr_32, 0, %63:vgpr_32, 0, 
%64:vgpr_32, 0, 0, implicit $mode, implicit $exec + %93:vgpr_32 = V_MAX3_F32_e64 0, %92:vgpr_32, 0, %65:vgpr_32, 0, %66:vgpr_32, 0, 0, implicit $mode, implicit $exec + %94:vgpr_32 = V_MAX3_F32_e64 0, %93:vgpr_32, 0, %67:vgpr_32, 0, %68:vgpr_32, 0, 0, implicit $mode, implicit $exec + %95:vgpr_32 = V_MAX3_F32_e64 0, %94:vgpr_32, 0, %69:vgpr_32, 0, %70:vgpr_32, 0, 0, implicit $mode, implicit $exec + %96:vgpr_32 = V_MAX3_F32_e64 0, %95:vgpr_32, 0, %71:vgpr_32, 0, %72:vgpr_32, 0, 0, implicit $mode, implicit $exec + %97:vgpr_32 = V_MAX3_F32_e64 0, %96:vgpr_32, 0, %73:vgpr_32, 0, %74:vgpr_32, 0, 0, implicit $mode, implicit $exec + %98:vgpr_32 = V_MAX3_F32_e64 0, %97:vgpr_32, 0, %75:vgpr_32, 0, %76:vgpr_32, 0, 0, implicit $mode, implicit $exec + %99:vgpr_32 = V_MAX3_F32_e64 0, %98:vgpr_32, 0, %77:vgpr_32, 0, %78:vgpr_32, 0, 0, implicit $mode, implicit $exec + %100:vgpr_32 = V_MAX3_F32_e64 0, %99:vgpr_32, 0, %79:vgpr_32, 0, %80:vgpr_32, 0, 0, implicit $mode, implicit $exec + %101:vgpr_32 = V_MAX3_F32_e64 0, %100:vgpr_32, 0, %81:vgpr_32, 0, %82:vgpr_32, 0, 0, implicit $mode, implicit $exec + %102:vgpr_32 = V_MAX3_F32_e64 0, %101:vgpr_32, 0, %83:vgpr_32, 0, %84:vgpr_32, 0, 0, implicit $mode, implicit $exec + %103:vgpr_32 = V_MAX3_F32_e64 0, %102:vgpr_32, 0, %85:vgpr_32, 0, %86:vgpr_32, 0, 0, implicit $mode, implicit $exec + %104:vgpr_32 = V_MAX3_F32_e64 0, %103:vgpr_32, 0, %87:vgpr_32, 0, %88:vgpr_32, 0, 0, implicit $mode, implicit $exec + %105:vgpr_32 = DS_BPERMUTE_B32 %106:vgpr_32, %104:vgpr_32, 0, implicit $exec + %107:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %105:vgpr_32, %105:vgpr_32, implicit $mode, implicit $exec + %108:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %104:vgpr_32, %107:vgpr_32, implicit $mode, implicit $exec + %109:vgpr_32 = DS_BPERMUTE_B32 %106:vgpr_32, %108:vgpr_32, 0, implicit $exec + %110:vgpr_32 = V_CNDMASK_B32_e64 0, %109:vgpr_32, 0, %108:vgpr_32, %12:sreg_64_xexec, implicit $exec + %111:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %110:vgpr_32, 
%110:vgpr_32, implicit $mode, implicit $exec + %112:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %13:vgpr_32, %13:vgpr_32, implicit $mode, implicit $exec + %113:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %112:vgpr_32, %111:vgpr_32, implicit $mode, implicit $exec + %114:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub0:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %115:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %114:vgpr_32, implicit $mode, implicit $exec + %116:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %115:vgpr_32, implicit $mode, implicit $exec + %117:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub1:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %118:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %117:vgpr_32, implicit $mode, implicit $exec + %119:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %118:vgpr_32, implicit $mode, implicit $exec + %120:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub2:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %121:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %120:vgpr_32, implicit $mode, implicit $exec + %122:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %121:vgpr_32, implicit $mode, implicit $exec + %123:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub3:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %124:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %123:vgpr_32, implicit $mode, implicit $exec + %125:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %124:vgpr_32, implicit $mode, implicit $exec + %126:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub4:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %127:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %126:vgpr_32, implicit $mode, implicit $exec + %128:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %127:vgpr_32, implicit $mode, 
implicit $exec + %129:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub5:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %130:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %129:vgpr_32, implicit $mode, implicit $exec + %131:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %130:vgpr_32, implicit $mode, implicit $exec + %132:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub6:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %133:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %132:vgpr_32, implicit $mode, implicit $exec + %134:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %133:vgpr_32, implicit $mode, implicit $exec + %135:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub7:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %136:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %135:vgpr_32, implicit $mode, implicit $exec + %137:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %136:vgpr_32, implicit $mode, implicit $exec + %138:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub8:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %139:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %138:vgpr_32, implicit $mode, implicit $exec + %140:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %139:vgpr_32, implicit $mode, implicit $exec + %141:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub9:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %142:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %141:vgpr_32, implicit $mode, implicit $exec + %143:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %142:vgpr_32, implicit $mode, implicit $exec + %144:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub10:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %145:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %144:vgpr_32, 
implicit $mode, implicit $exec + %146:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %145:vgpr_32, implicit $mode, implicit $exec + %147:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub11:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %148:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %147:vgpr_32, implicit $mode, implicit $exec + %149:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %148:vgpr_32, implicit $mode, implicit $exec + %150:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub12:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %151:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %150:vgpr_32, implicit $mode, implicit $exec + %152:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %151:vgpr_32, implicit $mode, implicit $exec + %153:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub13:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %154:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %153:vgpr_32, implicit $mode, implicit $exec + %155:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %154:vgpr_32, implicit $mode, implicit $exec + %156:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub14:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %157:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %156:vgpr_32, implicit $mode, implicit $exec + %158:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %157:vgpr_32, implicit $mode, implicit $exec + %159:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub15:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %160:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %159:vgpr_32, implicit $mode, implicit $exec + %161:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %160:vgpr_32, implicit $mode, implicit $exec + %162:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub0:vreg_512_align2, 1, %113:vgpr_32, 0, 
0, implicit $mode, implicit $exec + %163:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %162:vgpr_32, implicit $mode, implicit $exec + %164:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %163:vgpr_32, implicit $mode, implicit $exec + %165:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub1:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %166:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %165:vgpr_32, implicit $mode, implicit $exec + %167:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %166:vgpr_32, implicit $mode, implicit $exec + %168:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub2:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %169:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %168:vgpr_32, implicit $mode, implicit $exec + %170:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %169:vgpr_32, implicit $mode, implicit $exec + %171:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub3:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %172:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %171:vgpr_32, implicit $mode, implicit $exec + %173:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %172:vgpr_32, implicit $mode, implicit $exec + %174:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub4:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %175:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %174:vgpr_32, implicit $mode, implicit $exec + %176:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %175:vgpr_32, implicit $mode, implicit $exec + %177:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub5:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %178:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %177:vgpr_32, implicit $mode, implicit $exec + %179:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %178:vgpr_32, implicit $mode, implicit $exec + %180:vgpr_32 = 
contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub6:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %181:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %180:vgpr_32, implicit $mode, implicit $exec + %182:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %181:vgpr_32, implicit $mode, implicit $exec + %183:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub7:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %184:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %183:vgpr_32, implicit $mode, implicit $exec + %185:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %184:vgpr_32, implicit $mode, implicit $exec + %186:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub8:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %187:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %186:vgpr_32, implicit $mode, implicit $exec + %188:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %187:vgpr_32, implicit $mode, implicit $exec + %189:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub9:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %190:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %189:vgpr_32, implicit $mode, implicit $exec + %191:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %190:vgpr_32, implicit $mode, implicit $exec + %192:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub10:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %193:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %192:vgpr_32, implicit $mode, implicit $exec + %194:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %193:vgpr_32, implicit $mode, implicit $exec + %195:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub11:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %196:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %195:vgpr_32, implicit $mode, implicit $exec + 
%197:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %196:vgpr_32, implicit $mode, implicit $exec + %198:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub12:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %199:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %198:vgpr_32, implicit $mode, implicit $exec + %200:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %199:vgpr_32, implicit $mode, implicit $exec + %201:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub13:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %202:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %201:vgpr_32, implicit $mode, implicit $exec + %203:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %202:vgpr_32, implicit $mode, implicit $exec + %204:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub14:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %205:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %204:vgpr_32, implicit $mode, implicit $exec + %206:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %205:vgpr_32, implicit $mode, implicit $exec + %207:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub15:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %208:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %207:vgpr_32, implicit $mode, implicit $exec + %209:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %208:vgpr_32, implicit $mode, implicit $exec + %210:vgpr_32 = contract nofpexcept V_ADD_F32_e32 0, %116:vgpr_32, implicit $mode, implicit $exec + %211:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %119:vgpr_32, %210:vgpr_32, implicit $mode, implicit $exec + %212:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %122:vgpr_32, %211:vgpr_32, implicit $mode, implicit $exec + %213:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %125:vgpr_32, %212:vgpr_32, implicit $mode, implicit $exec + %214:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %128:vgpr_32, 
%213:vgpr_32, implicit $mode, implicit $exec + %215:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %131:vgpr_32, %214:vgpr_32, implicit $mode, implicit $exec + %216:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %134:vgpr_32, %215:vgpr_32, implicit $mode, implicit $exec + %217:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %137:vgpr_32, %216:vgpr_32, implicit $mode, implicit $exec + %218:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %140:vgpr_32, %217:vgpr_32, implicit $mode, implicit $exec + %219:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %143:vgpr_32, %218:vgpr_32, implicit $mode, implicit $exec + %220:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %146:vgpr_32, %219:vgpr_32, implicit $mode, implicit $exec + %221:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %149:vgpr_32, %220:vgpr_32, implicit $mode, implicit $exec + %222:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %152:vgpr_32, %221:vgpr_32, implicit $mode, implicit $exec + %223:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %155:vgpr_32, %222:vgpr_32, implicit $mode, implicit $exec + %224:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %158:vgpr_32, %223:vgpr_32, implicit $mode, implicit $exec + %225:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %161:vgpr_32, %224:vgpr_32, implicit $mode, implicit $exec + %226:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %164:vgpr_32, %225:vgpr_32, implicit $mode, implicit $exec + %227:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %167:vgpr_32, %226:vgpr_32, implicit $mode, implicit $exec + %228:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %170:vgpr_32, %227:vgpr_32, implicit $mode, implicit $exec + %229:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %173:vgpr_32, %228:vgpr_32, implicit $mode, implicit $exec + %230:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %176:vgpr_32, %229:vgpr_32, implicit $mode, implicit $exec + %231:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %179:vgpr_32, %230:vgpr_32, implicit $mode, implicit $exec + %232:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %182:vgpr_32, %231:vgpr_32, 
implicit $mode, implicit $exec + %233:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %185:vgpr_32, %232:vgpr_32, implicit $mode, implicit $exec + %234:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %188:vgpr_32, %233:vgpr_32, implicit $mode, implicit $exec + %235:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %191:vgpr_32, %234:vgpr_32, implicit $mode, implicit $exec + %236:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %194:vgpr_32, %235:vgpr_32, implicit $mode, implicit $exec + %237:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %197:vgpr_32, %236:vgpr_32, implicit $mode, implicit $exec + %238:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %200:vgpr_32, %237:vgpr_32, implicit $mode, implicit $exec + %239:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %203:vgpr_32, %238:vgpr_32, implicit $mode, implicit $exec + %240:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %206:vgpr_32, %239:vgpr_32, implicit $mode, implicit $exec + %241:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %209:vgpr_32, %240:vgpr_32, implicit $mode, implicit $exec + %242:vgpr_32 = DS_BPERMUTE_B32 %106:vgpr_32, %241:vgpr_32, 0, implicit $exec + %243:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %241:vgpr_32, %242:vgpr_32, implicit $mode, implicit $exec + %244:vgpr_32 = DS_BPERMUTE_B32 %106:vgpr_32, %243:vgpr_32, 0, implicit $exec + %0:vgpr_32 = V_CNDMASK_B32_e64 0, %244:vgpr_32, 0, %243:vgpr_32, %12:sreg_64_xexec, implicit $exec + %245:vgpr_32 = contract nofpexcept V_SUB_F32_e32 %13:vgpr_32, %113:vgpr_32, implicit $mode, implicit $exec + %246:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %245:vgpr_32, implicit $mode, implicit $exec + undef %247.sub0:vreg_64_align2 = afn nofpexcept V_EXP_F32_e32 %246:vgpr_32, implicit $mode, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %248:vgpr_32 = V_PERM_B32_e64 %40.sub0:vreg_64_align2, %38.sub0:vreg_64_align2, %14:sreg_32, implicit $exec + %249:vgpr_32 = V_PERM_B32_e64 %40.sub0:vreg_64_align2, 
%38.sub0:vreg_64_align2, %15:sreg_32, implicit $exec + %250:vgpr_32 = V_PERM_B32_e64 %40.sub1:vreg_64_align2, %38.sub1:vreg_64_align2, %14:sreg_32, implicit $exec + %251:vgpr_32 = V_PERM_B32_e64 %40.sub1:vreg_64_align2, %38.sub1:vreg_64_align2, %15:sreg_32, implicit $exec + %252:vgpr_32 = V_ADD_U32_e32 %27:sreg_32, %16:vgpr_32, implicit $exec + %253:vgpr_32 = V_AND_B32_e32 536870911, %252:vgpr_32, implicit $exec + %254:vgpr_32 = nsw V_MUL_LO_U32_e64 %253:vgpr_32, %17:sreg_32, implicit $exec + %255:vgpr_32 = V_ADD_LSHL_U32_e64 %18:vgpr_32, %254:vgpr_32, 1, implicit $exec + DS_WRITE_B32_gfx9 %255:vgpr_32, %248:vgpr_32, 0, 0, implicit $exec + %256:vgpr_32 = V_LSHL_ADD_U32_e64 %19:vgpr_32, 1, %255:vgpr_32, implicit $exec + DS_WRITE_B32_gfx9 %256:vgpr_32, %249:vgpr_32, 0, 0, implicit $exec + %257:vgpr_32 = V_LSHL_ADD_U32_e64 %20:vgpr_32, 1, %256:vgpr_32, implicit $exec + DS_WRITE_B32_gfx9 %257:vgpr_32, %250:vgpr_32, 0, 0, implicit $exec + %258:vgpr_32 = V_LSHL_ADD_U32_e64 %21:vgpr_32, 1, %257:vgpr_32, implicit $exec + DS_WRITE_B32_gfx9 %258:vgpr_32, %251:vgpr_32, 0, 0, implicit $exec + %0:vgpr_32 = contract nofpexcept V_FMAC_F32_e32 %1:vgpr_32, %247.sub0:vreg_64_align2, %0:vgpr_32, implicit $mode, implicit $exec + %259.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub0_sub1:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %259.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub2_sub3:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %259.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub4_sub5:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %259.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub6_sub7:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %259.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, 
%259.sub8_sub9:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %259.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub10_sub11:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %259.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub12_sub13:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %259.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub14_sub15:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %260.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub0_sub1:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %260.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub2_sub3:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %260.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub4_sub5:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %260.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub6_sub7:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %260.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub8_sub9:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %260.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub10_sub11:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %260.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub12_sub13:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %260.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub14_sub15:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, 
implicit $mode, implicit $exec + %261:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %116:vgpr_32, implicit $mode, implicit $exec + %262:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %119:vgpr_32, implicit $mode, implicit $exec + %263:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %122:vgpr_32, implicit $mode, implicit $exec + %264:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %128:vgpr_32, implicit $mode, implicit $exec + %265:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %131:vgpr_32, implicit $mode, implicit $exec + %266:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %134:vgpr_32, implicit $mode, implicit $exec + %267:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %140:vgpr_32, implicit $mode, implicit $exec + %268:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %143:vgpr_32, implicit $mode, implicit $exec + %269:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %146:vgpr_32, implicit $mode, implicit $exec + %270:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %152:vgpr_32, implicit $mode, implicit $exec + %271:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %155:vgpr_32, implicit $mode, implicit $exec + %272:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %158:vgpr_32, implicit $mode, implicit $exec + %273:vgpr_32 = V_ADD_U32_e32 %22:vgpr_32, %37:vgpr_32, implicit $exec + %274:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %273:vgpr_32, %8:sgpr_128, 0, 0, 0, 0, implicit $exec + %275:vgpr_32 = V_ADD_U32_e32 %23:vgpr_32, %37:vgpr_32, implicit $exec + %276:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %275:vgpr_32, %8:sgpr_128, 0, 0, 0, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %277:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec + %278:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 576, 0, implicit $exec + %279:vreg_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 0, 0, implicit $exec + %280:vreg_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 576, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect 
*/, 13 /* imm */, 8, !0 + %281:vgpr_32 = V_PERM_B32_e64 %276.sub0:vreg_64_align2, %274.sub0:vreg_64_align2, %14:sreg_32, implicit $exec + %282:vgpr_32 = V_PERM_B32_e64 %276.sub0:vreg_64_align2, %274.sub0:vreg_64_align2, %15:sreg_32, implicit $exec + %283:vgpr_32 = V_PERM_B32_e64 %276.sub1:vreg_64_align2, %274.sub1:vreg_64_align2, %14:sreg_32, implicit $exec + %284:vgpr_32 = V_PERM_B32_e64 %276.sub1:vreg_64_align2, %274.sub1:vreg_64_align2, %15:sreg_32, implicit $exec + DS_WRITE_B32_gfx9 %255:vgpr_32, %281:vgpr_32, 0, 0, implicit $exec + DS_WRITE_B32_gfx9 %256:vgpr_32, %282:vgpr_32, 0, 0, implicit $exec + DS_WRITE_B32_gfx9 %257:vgpr_32, %283:vgpr_32, 0, 0, implicit $exec + DS_WRITE_B32_gfx9 %258:vgpr_32, %284:vgpr_32, 0, 0, implicit $exec + %285:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %125:vgpr_32, implicit $mode, implicit $exec + %286:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %137:vgpr_32, implicit $mode, implicit $exec + %287:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %149:vgpr_32, implicit $mode, implicit $exec + %288:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %161:vgpr_32, implicit $mode, implicit $exec + undef %289.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %263:vgpr_32, 0, %285:vgpr_32, 0, 0, implicit $mode, implicit $exec + %289.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %261:vgpr_32, 0, %262:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %290.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %266:vgpr_32, 0, %286:vgpr_32, 0, 0, implicit $mode, implicit $exec + %290.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %264:vgpr_32, 0, %265:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %291.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %269:vgpr_32, 0, %287:vgpr_32, 0, 0, implicit $mode, implicit $exec + %291.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %267:vgpr_32, 0, %268:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %292.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %272:vgpr_32, 0, 
%288:vgpr_32, 0, 0, implicit $mode, implicit $exec + %292.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %270:vgpr_32, 0, %271:vgpr_32, 0, 0, implicit $mode, implicit $exec + %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %277.sub0_sub1:vreg_128_align2, %289:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %277.sub2_sub3:vreg_128_align2, %290:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %278.sub0_sub1:vreg_128_align2, %289:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %278.sub2_sub3:vreg_128_align2, %290:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %279.sub0_sub1:vreg_128_align2, %291:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %279.sub2_sub3:vreg_128_align2, %292:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %280.sub0_sub1:vreg_128_align2, %291:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %280.sub2_sub3:vreg_128_align2, %292:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %293:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %209:vgpr_32, implicit $mode, implicit $exec + %294:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %203:vgpr_32, implicit $mode, implicit $exec + %295:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %197:vgpr_32, implicit $mode, implicit $exec + %296:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 
%191:vgpr_32, implicit $mode, implicit $exec + %297:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %185:vgpr_32, implicit $mode, implicit $exec + %298:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %179:vgpr_32, implicit $mode, implicit $exec + %299:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %173:vgpr_32, implicit $mode, implicit $exec + %300:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %167:vgpr_32, implicit $mode, implicit $exec + %301:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %206:vgpr_32, implicit $mode, implicit $exec + %302:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %200:vgpr_32, implicit $mode, implicit $exec + %303:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %194:vgpr_32, implicit $mode, implicit $exec + %304:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %188:vgpr_32, implicit $mode, implicit $exec + %305:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %182:vgpr_32, implicit $mode, implicit $exec + %306:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %176:vgpr_32, implicit $mode, implicit $exec + %307:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %170:vgpr_32, implicit $mode, implicit $exec + %308:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %164:vgpr_32, implicit $mode, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + undef %309.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %307:vgpr_32, 0, %299:vgpr_32, 0, 0, implicit $mode, implicit $exec + %309.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %308:vgpr_32, 0, %300:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %310.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %305:vgpr_32, 0, %297:vgpr_32, 0, 0, implicit $mode, implicit $exec + %310.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %306:vgpr_32, 0, %298:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %311.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %303:vgpr_32, 0, %295:vgpr_32, 0, 0, implicit $mode, implicit $exec + %311.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %304:vgpr_32, 
0, %296:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %312.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %301:vgpr_32, 0, %293:vgpr_32, 0, 0, implicit $mode, implicit $exec + %312.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %302:vgpr_32, 0, %294:vgpr_32, 0, 0, implicit $mode, implicit $exec + %313:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec + %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %313.sub0_sub1:vreg_128_align2, %309:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %313.sub2_sub3:vreg_128_align2, %310:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %314:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 576, 0, implicit $exec + %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %314.sub0_sub1:vreg_128_align2, %309:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %314.sub2_sub3:vreg_128_align2, %310:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %315:vreg_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 0, 0, implicit $exec + %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %315.sub0_sub1:vreg_128_align2, %311:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %315.sub2_sub3:vreg_128_align2, %312:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %316:vreg_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 576, 0, implicit $exec + %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %316.sub0_sub1:vreg_128_align2, %311:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %260:vreg_512_align2 = contract 
V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %316.sub2_sub3:vreg_128_align2, %312:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %37:vgpr_32 = V_ADD_U32_e32 %26:sreg_32, %37:vgpr_32, implicit $exec + %29:vgpr_32 = nuw V_ADD_U32_e32 64, %29:vgpr_32, implicit $exec + S_ENDPGM 0 +... + +