diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 403014db56171..323560a46f31d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -335,6 +335,9 @@ extern char &SIModeRegisterID; void initializeAMDGPUInsertDelayAluPass(PassRegistry &); extern char &AMDGPUInsertDelayAluID; +void initializeAMDGPUInsertSingleUseVDSTPass(PassRegistry &); +extern char &AMDGPUInsertSingleUseVDSTID; + void initializeSIInsertHardClausesPass(PassRegistry &); extern char &SIInsertHardClausesID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp new file mode 100644 index 0000000000000..93ed77bb6f7ef --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp @@ -0,0 +1,122 @@ +//===- AMDGPUInsertSingleUseVDST.cpp - Insert s_singleuse_vdst instructions ==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Insert s_singleuse_vdst instructions on GFX11.5+ to mark regions of VALU +/// instructions that produce single-use VGPR values. If the value is forwarded +/// to the consumer instruction prior to VGPR writeback, the hardware can +/// then skip (kill) the VGPR write. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCRegister.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-insert-single-use-vdst"
+
+namespace {
+/// Inserts an s_singleuse_vdst hint in front of every VALU instruction whose
+/// defined registers are each read at most once before being redefined.
+///
+/// Every basic block is walked bottom-up while counting, per physical
+/// register, how often it is read below the current instruction. Registers
+/// that are live out of the block, or that are live across a write to EXEC,
+/// are conservatively treated as having more than one use.
+class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
+private:
+  /// Instruction info of the function currently being processed. Assigned at
+  /// the start of runOnMachineFunction(), before any hint is emitted.
+  const SIInstrInfo *SII = nullptr;
+
+public:
+  static char ID;
+
+  AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {}
+
+  /// The pass only inserts non-terminator instructions, so the CFG is left
+  /// untouched.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  /// Insert an s_singleuse_vdst instruction immediately before \p MI.
+  void emitSingleUseVDST(MachineInstr &MI) const {
+    // Mark the following instruction as a single-use producer:
+    //   s_singleuse_vdst { supr0: 1 }
+    BuildMI(*MI.getParent(), MI, DebugLoc(), SII->get(AMDGPU::S_SINGLEUSE_VDST))
+        .addImm(0x1);
+  }
+
+  /// Scan \p MF and insert the hints.
+  ///
+  /// \returns true if at least one s_singleuse_vdst was inserted; false if
+  /// nothing changed or the subtarget has no single-use hint instructions.
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    const auto &ST = MF.getSubtarget<GCNSubtarget>();
+    if (!ST.hasVGPRSingleUseHintInsts())
+      return false;
+
+    SII = ST.getInstrInfo();
+    const auto *TRI = &SII->getRegisterInfo();
+    bool InstructionEmitted = false;
+
+    for (MachineBasicBlock &MBB : MF) {
+      // Number of reads seen so far (scanning upwards) for each register.
+      // Only "0/1" versus "more than 1" is ever tested.
+      DenseMap<MCPhysReg, unsigned> RegisterUseCount; // TODO: MCRegUnits
+
+      // Handle boundaries at the end of basic block separately to avoid
+      // false positives. If they are live at the end of a basic block then
+      // assume it has more uses later on.
+      for (const auto &Liveouts : MBB.liveouts())
+        RegisterUseCount[Liveouts.PhysReg] = 2;
+
+      for (MachineInstr &MI : reverse(MBB.instrs())) {
+        // Debug instructions must not influence code generation, so their
+        // register operands are not counted as uses.
+        if (MI.isDebugInstr())
+          continue;
+
+        // Do not attempt to optimise across exec mask changes: when this
+        // instruction writes EXEC, every register read below it is treated
+        // as having multiple uses. The query does not depend on the operand
+        // being visited, so it is evaluated once per instruction.
+        const bool ModifiesExec = MI.modifiesRegister(AMDGPU::EXEC, TRI);
+        if (ModifiesExec) {
+          for (auto &UsedReg : RegisterUseCount)
+            UsedReg.second = 2;
+        }
+
+        // All registers in all operands need to be single use for an
+        // instruction to be marked as a single use producer.
+        bool AllProducerOperandsAreSingleUse = true;
+
+        for (const auto &Operand : MI.operands()) {
+          if (!Operand.isReg())
+            continue;
+          const auto Reg = Operand.getReg();
+
+          // Count the number of times each register is read. Reads made by
+          // an exec-modifying instruction are pinned to "multiple uses" as
+          // well, matching the treatment of every other live register.
+          if (Operand.readsReg()) {
+            unsigned &Count = RegisterUseCount[Reg];
+            Count = ModifiesExec ? 2 : Count + 1;
+          }
+
+          // If we are at the point where the register first became live,
+          // check if the operands are single use.
+          if (!MI.modifiesRegister(Reg, TRI))
+            continue;
+          if (RegisterUseCount[Reg] > 1)
+            AllProducerOperandsAreSingleUse = false;
+          // Reset uses count when a register is no longer live.
+          RegisterUseCount.erase(Reg);
+        }
+        if (AllProducerOperandsAreSingleUse && SIInstrInfo::isVALU(MI)) {
+          // TODO: Replace with candidate logging for instruction grouping
+          // later.
+          emitSingleUseVDST(MI);
+          InstructionEmitted = true;
+        }
+      }
+    }
+    return InstructionEmitted;
+  }
+};
+} // namespace
+
+char AMDGPUInsertSingleUseVDST::ID = 0;
+
+char &llvm::AMDGPUInsertSingleUseVDSTID = AMDGPUInsertSingleUseVDST::ID;
+
+INITIALIZE_PASS(AMDGPUInsertSingleUseVDST, DEBUG_TYPE,
+                "AMDGPU Insert SingleUseVDST", false, false)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 951ed9420594b..0c38fa32c6f33 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -286,6 +286,12 @@ static cl::opt<bool> EnableSIModeRegisterPass(
   cl::init(true),
   cl::Hidden);
 
+// Enable GFX11.5+ s_singleuse_vdst insertion
+static cl::opt<bool>
+    EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
+                              cl::desc("Enable s_singleuse_vdst insertion"),
+                              cl::init(false), cl::Hidden);
+
 // Enable GFX11+ s_delay_alu insertion
 static cl::opt<bool>
     EnableInsertDelayAlu("amdgpu-enable-delay-alu",
@@ -404,6 +410,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
+  initializeAMDGPUInsertSingleUseVDSTPass(*PR);
   initializeAMDGPUInsertDelayAluPass(*PR);
   initializeSIInsertHardClausesPass(*PR);
   initializeSIInsertWaitcntsPass(*PR);
@@ -1448,6 +1455,9 @@ void GCNPassConfig::addPreEmitPass() {
   // cases.
addPass(&PostRAHazardRecognizerID); + if (isPassEnabled(EnableInsertSingleUseVDST, CodeGenOptLevel::Less)) + addPass(&AMDGPUInsertSingleUseVDSTID); + if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) addPass(&AMDGPUInsertDelayAluID); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 0c0720890794b..53a33f8210d2a 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -77,6 +77,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUMacroFusion.cpp AMDGPUMCInstLower.cpp AMDGPUIGroupLP.cpp + AMDGPUInsertSingleUseVDST.cpp AMDGPUMIRFormatter.cpp AMDGPUOpenCLEnqueuedBlockLowering.cpp AMDGPUPerfHintAnalysis.cpp diff --git a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir new file mode 100644 index 0000000000000..5b75e42b75d29 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir @@ -0,0 +1,627 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc -march=amdgcn -mcpu=gfx1150 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-single-use-vdst %s -o - | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx1150 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-single-use-vdst %s -o - | FileCheck %s + +# One single-use producer. 
+--- +name: one_producer +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: one_producer + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr0, $vgpr2 + ; CHECK-NEXT: {{ $}} + bb.0: + liveins: $vgpr0 + $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec + bb.1: + liveins: $vgpr0, $vgpr2 +... + +# One single-use producer of a 64-bit value. +--- +name: one_producer_64bit +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: one_producer_64bit + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr2_vgpr3 = V_LSHLREV_B64_e64 0, $vgpr0_vgpr1, implicit $exec + ; CHECK-NEXT: $vgpr4_vgpr5 = V_LSHLREV_B64_e64 0, $vgpr2_vgpr3, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr4_vgpr5 + ; CHECK-NEXT: {{ $}} + bb.0: + liveins: $vgpr0_vgpr1 + $vgpr2_vgpr3 = V_LSHLREV_B64_e64 0, $vgpr0_vgpr1, implicit $exec + $vgpr4_vgpr5 = V_LSHLREV_B64_e64 0, $vgpr2_vgpr3, implicit $exec + bb.1: + liveins: $vgpr4_vgpr5 +... + +# Two consecutive single-use producers. 
+--- +name: two_producers +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: two_producers + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec + ; CHECK-NEXT: $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr0, $vgpr3 + ; CHECK-NEXT: {{ $}} + bb.0: + liveins: $vgpr0 + $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec + bb.1: + liveins: $vgpr0, $vgpr3 +... + +# Redefinitions of v0. +--- +name: redefinitions +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: redefinitions + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + bb.0: + liveins: $vgpr0 + $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + bb.1: +... + +# One producer with no consumers. 
+--- +name: no_consumer +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: no_consumer + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + bb.0: + liveins: $vgpr0 + $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + bb.1: +... + +# One consumer with two uses of the same value. +--- +name: one_consumer_two_uses +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: one_consumer_two_uses + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr0, $vgpr2 + ; CHECK-NEXT: {{ $}} + bb.0: + liveins: $vgpr0 + $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr2 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec + bb.1: + liveins: $vgpr0, $vgpr2 +... + +# A longer example. 
+--- +name: longer_example +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: longer_example + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr3, $vgpr5, $sgpr0, $sgpr2, $sgpr4, $sgpr5, $sgpr16, $sgpr17, $sgpr18, $sgpr19 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr14 = V_MUL_F32_e32 $sgpr4, $vgpr3, implicit $exec, implicit $mode + ; CHECK-NEXT: $sgpr3 = S_MUL_F16 $sgpr0, $sgpr2, implicit $mode + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr15 = V_MUL_F32_e32 $sgpr5, $vgpr3, implicit $exec, implicit $mode + ; CHECK-NEXT: $vgpr17 = V_FMA_F32_e64 0, $sgpr16, 0, $vgpr5, 0, $vgpr14, 0, 0, implicit $exec, implicit $mode + ; CHECK-NEXT: $sgpr1 = S_ADD_F16 $sgpr0, 15360, implicit $mode + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr15 = V_FMA_F32_e64 0, $sgpr17, 0, $vgpr5, 0, $vgpr15, 0, 0, implicit $exec, implicit $mode + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr14 = V_FMA_F32_e64 0, $sgpr18, 0, $vgpr15, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode + ; CHECK-NEXT: $vgpr15 = V_FMA_F32_e64 0, $sgpr19, 0, $vgpr14, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode + ; CHECK-NEXT: $vgpr16 = V_LOG_F32_e32 $vgpr15, implicit $exec, implicit $mode + ; CHECK-NEXT: $vgpr18 = V_EXP_F32_e32 $vgpr15, implicit $exec, implicit $mode + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr16, $vgpr18 + ; CHECK-NEXT: {{ $}} + bb.0: + liveins: $vgpr3, $vgpr5, $sgpr0, $sgpr2, $sgpr4, $sgpr5, $sgpr16, $sgpr17, $sgpr18, $sgpr19 + $vgpr14 = V_MUL_F32_e32 $sgpr4, $vgpr3, implicit $exec, implicit $mode + $sgpr3 = S_MUL_F16 $sgpr0, $sgpr2, implicit $mode + $vgpr15 = V_MUL_F32_e32 $sgpr5, $vgpr3, implicit $exec, implicit $mode + $vgpr17 = V_FMA_F32_e64 0, $sgpr16, 0, $vgpr5, 0, $vgpr14, 0, 0, implicit $exec, implicit $mode + $sgpr1 = S_ADD_F16 $sgpr0, 15360, implicit $mode + $vgpr15 = V_FMA_F32_e64 0, $sgpr17, 0, $vgpr5, 0, $vgpr15, 0, 0, implicit $exec, implicit 
$mode + $vgpr14 = V_FMA_F32_e64 0, $sgpr18, 0, $vgpr15, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode + $vgpr15 = V_FMA_F32_e64 0, $sgpr19, 0, $vgpr14, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode + $vgpr16 = V_LOG_F32_e32 $vgpr15, implicit $exec, implicit $mode + $vgpr18 = V_EXP_F32_e32 $vgpr15, implicit $exec, implicit $mode + bb.1: + liveins: $vgpr16, $vgpr18 +... + +# Multiple uses of v0. +--- +name: multiple_uses_1 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: multiple_uses_1 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + bb.0: + liveins: $vgpr0 + $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + bb.1: + liveins: $vgpr1, $vgpr2 +... + +# Multiple uses of v0 and redefinitions of v1 and v2. 
+--- +name: multiple_uses_2 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: multiple_uses_2 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + bb.0: + liveins: $vgpr0 + $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + bb.1: + liveins: $vgpr1, $vgpr2 +... + +# Multiple uses of all but v1. 
+--- +name: multiple_uses_3 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: multiple_uses_3 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + bb.0: + liveins: $vgpr0 + $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec + bb.1: + liveins: $vgpr2, $vgpr3 +... + +# Results are live-in to another basic block. +--- +name: basic_block_1 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: basic_block_1 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + bb.0: + liveins: $vgpr0 + $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + bb.1: + liveins: $vgpr0, $vgpr1, $vgpr2 + $vgpr0 = 
V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + bb.2: + liveins: $vgpr1, $vgpr2 +... + +# Result v2 has multiple uses in another basic block. +--- +name: basic_block_2 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: basic_block_2 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $vgpr3 + ; CHECK-NEXT: {{ $}} + bb.0: + liveins: $vgpr0, $vgpr1 + $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec + bb.1: + liveins: $vgpr2 + $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec + $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec + bb.2: + liveins: $vgpr3 +... + +# Results are redefined in another basic block. 
+--- +name: basic_block_3 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: basic_block_3 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec + ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + bb.0: + liveins: $vgpr0 + $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + bb.1: + liveins: $vgpr0, $vgpr1 + $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec + bb.2: + liveins: $vgpr0, $vgpr1, $vgpr2 +... + +# Exec modified between producer and consumer. 
+--- +name: exec_mask +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: exec_mask + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: $exec = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + bb.0: + liveins: $sgpr0_sgpr1 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $exec = COPY $sgpr0_sgpr1 + $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + bb.1: + liveins: $vgpr0 +... + +# Exec_lo modified between producer and consumer. +--- +name: exec_mask_lo +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: exec_mask_lo + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: $exec_lo = COPY $sgpr0 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + bb.0: + liveins: $sgpr0 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $exec_lo = COPY $sgpr0 + $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + bb.1: + liveins: $vgpr0 +... + +# Exec_hi modified between producer and consumer.
+--- +name: exec_mask_hi +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: exec_mask_hi + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: $exec_hi = COPY $sgpr0 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + bb.0: + liveins: $sgpr0 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $exec_hi = COPY $sgpr0 + $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec + bb.1: + liveins: $vgpr0 +... + +# Write 32-bit vgpr and then read from low 16 bits. +--- +name: write_full_read_lo +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: write_full_read_lo + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr1_lo16 + ; CHECK-NEXT: {{ $}} + bb.0: + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec + bb.1: + liveins: $vgpr1_lo16 +... + +# Write 32-bit vgpr and then read from high 16 bits.
+--- +name: write_full_read_hi +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: write_full_read_hi + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr1_hi16 + ; CHECK-NEXT: {{ $}} + bb.0: + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec + bb.1: + liveins: $vgpr1_hi16 +... + +# Write 32-bit vgpr and then read from both halves. +--- +name: write_full_read_both +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: write_full_read_both + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr1 + ; CHECK-NEXT: {{ $}} + bb.0: + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec + $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec + bb.1: + liveins: $vgpr1 +... + +# Write 32-bit vgpr and then read from both halves in the same instruction. 
+--- +name: write_full_read_both_same_instruction +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: write_full_read_both_same_instruction + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: $vgpr1_lo16 = V_ADD_F16_t16_e32 $vgpr0_lo16, $vgpr0_hi16, implicit $mode, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr1_lo16 + ; CHECK-NEXT: {{ $}} + bb.0: + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $vgpr1_lo16 = V_ADD_F16_t16_e32 $vgpr0_lo16, $vgpr0_hi16, implicit $mode, implicit $exec + bb.1: + liveins: $vgpr1_lo16 +... + +# Write low 16-bits and then read 32-bit vgpr. +--- +name: write_lo_read_full +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: write_lo_read_full + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr1 + ; CHECK-NEXT: {{ $}} + bb.0: + liveins: $vgpr0 + $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec + $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + bb.1: + liveins: $vgpr1 +... + +# Write high 16-bits and then read 32-bit vgpr. 
+--- +name: write_hi_read_full +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: write_hi_read_full + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_SINGLEUSE_VDST 1 + ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $vgpr1 + ; CHECK-NEXT: {{ $}} + bb.0: + liveins: $vgpr0 + $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec + $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + bb.1: + liveins: $vgpr1 +...