diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index a55a1747cafe61..fbed51de0ea497 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -156,6 +156,9 @@ extern char &SIWholeQuadModeID;
 void initializeSILowerControlFlowPass(PassRegistry &);
 extern char &SILowerControlFlowID;
 
+void initializeSIRemoveShortExecBranchesPass(PassRegistry &);
+extern char &SIRemoveShortExecBranchesID;
+
 void initializeSIInsertSkipsPass(PassRegistry &);
 extern char &SIInsertSkipsPassID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c8dc6f6e3bf4c7..eb30d659bf0b5b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -228,6 +228,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSIModeRegisterPass(*PR);
   initializeSIWholeQuadModePass(*PR);
   initializeSILowerControlFlowPass(*PR);
+  initializeSIRemoveShortExecBranchesPass(*PR);
   initializeSIInsertSkipsPass(*PR);
   initializeSIMemoryLegalizerPass(*PR);
   initializeSIOptimizeExecMaskingPass(*PR);
@@ -993,6 +994,7 @@ void GCNPassConfig::addPreEmitPass() {
   // be better for it to emit S_NOP when possible.
   addPass(&PostRAHazardRecognizerID);
 
+  addPass(&SIRemoveShortExecBranchesID);
   addPass(&SIInsertSkipsPassID);
   addPass(&BranchRelaxationPassID);
 }
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 3ed35e57e548ff..0b8eb4b25ae46d 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -116,6 +116,7 @@ add_llvm_target(AMDGPUCodeGen
   SIOptimizeExecMaskingPreRA.cpp
   SIPeepholeSDWA.cpp
   SIRegisterInfo.cpp
+  SIRemoveShortExecBranches.cpp
   SIShrinkInstructions.cpp
   SIWholeQuadMode.cpp
   GCNILPSched.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
index 87e63fcc4a04fa..80c044ec00cb39 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -41,7 +41,7 @@ using namespace llvm;
 #define DEBUG_TYPE "si-insert-skips"
 
 static cl::opt<unsigned> SkipThresholdFlag(
-  "amdgpu-skip-threshold",
+  "amdgpu-skip-threshold-legacy",
   cl::desc("Number of instructions before jumping over divergent control flow"),
   cl::init(12), cl::Hidden);
 
@@ -466,6 +466,9 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
       MachineInstr &MI = *I;
 
       switch (MI.getOpcode()) {
+      case AMDGPU::S_CBRANCH_EXECZ:
+        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
+        break;
       case AMDGPU::SI_MASK_BRANCH:
         ExecBranchStack.push_back(MI.getOperand(0).getMBB());
         MadeChange |= skipMaskBranch(MI, MBB);
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index bf052dc3c9304e..61d2719a3aad61 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -244,9 +244,9 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
   BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec)
     .addReg(Tmp, RegState::Kill);
 
-  // Insert a pseudo terminator to help keep the verifier happy. This will also
-  // be used later when inserting skips.
-  MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+  // Insert the S_CBRANCH_EXECZ instruction, which will be optimized away
+  // later by the SIRemoveShortExecBranches pass.
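+  // (Operand 2 of the SI_IF pseudo is the branch target block. If the
+  // skipped region turns out to be short and free of unwanted side effects,
+  // that pass deletes this branch and lets execution fall through even when
+  // EXEC = 0.)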
+  MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
                             .add(MI.getOperand(2));
 
   if (!LIS) {
@@ -323,8 +323,8 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
     .addReg(DstReg);
 
   MachineInstr *Branch =
-      BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
-          .addMBB(DestBB);
+      BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
+          .addMBB(DestBB);
 
   if (!LIS) {
     MI.eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
new file mode 100644
index 00000000000000..51779e97ac6207
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
@@ -0,0 +1,158 @@
+//===-- SIRemoveShortExecBranches.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass optimizes the s_cbranch_execz instructions.
+/// The pass removes this skip instruction for short branches,
+/// if there is no unwanted side effect in the fallthrough code sequence.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-remove-short-exec-branches"
+
+static unsigned SkipThreshold;
+
+static cl::opt<unsigned, true> SkipThresholdFlag(
+    "amdgpu-skip-threshold", cl::Hidden,
+    cl::desc(
+        "Number of instructions before jumping over divergent control flow"),
+    cl::location(SkipThreshold), cl::init(12));
+
+namespace {
+
+class SIRemoveShortExecBranches : public MachineFunctionPass {
+private:
+  const SIInstrInfo *TII = nullptr;
+  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
+                            MachineBasicBlock *&TrueMBB,
+                            MachineBasicBlock *&FalseMBB,
+                            SmallVectorImpl<MachineOperand> &Cond);
+  bool mustRetainExeczBranch(const MachineBasicBlock &From,
+                             const MachineBasicBlock &To) const;
+  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
+
+public:
+  static char ID;
+
+  SIRemoveShortExecBranches() : MachineFunctionPass(ID) {
+    initializeSIRemoveShortExecBranchesPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // End anonymous namespace.
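+
+// A minimal sketch of the transformation this pass performs, on hypothetical
+// assembly (block labels, registers and the VALU instruction are illustrative,
+// not taken from a real test). Before:
+//
+//     s_and_saveexec_b64 s[4:5], vcc
+//     s_cbranch_execz BB0_2         ; skip the then-block when no lane is live
+//   ; %bb.1:
+//     v_mul_u32_u24_e64 v1, s4, 5   ; short, no unwanted side effects
+//   BB0_2:
+//     s_or_b64 exec, exec, s[4:5]
+//
+// After: the s_cbranch_execz is deleted and the block falls through. With
+// EXEC = 0 the VALU instruction writes no lanes, so executing the few inert
+// instructions is cheaper than taking the branch:
+//
+//     s_and_saveexec_b64 s[4:5], vcc
+//   ; %bb.1:
+//     v_mul_u32_u24_e64 v1, s4, 5
+//   BB0_2:
+//     s_or_b64 exec, exec, s[4:5]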
+
+INITIALIZE_PASS(SIRemoveShortExecBranches, DEBUG_TYPE,
+                "SI remove short exec branches", false, false)
+
+char SIRemoveShortExecBranches::ID = 0;
+
+char &llvm::SIRemoveShortExecBranchesID = SIRemoveShortExecBranches::ID;
+
+bool SIRemoveShortExecBranches::getBlockDestinations(
+    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
+    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
+  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
+    return false;
+
+  if (!FalseMBB)
+    FalseMBB = SrcMBB.getNextNode();
+
+  return true;
+}
+
+bool SIRemoveShortExecBranches::mustRetainExeczBranch(
+    const MachineBasicBlock &From, const MachineBasicBlock &To) const {
+  unsigned NumInstr = 0;
+  const MachineFunction *MF = From.getParent();
+
+  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
+       MBBI != End && MBBI != ToI; ++MBBI) {
+    const MachineBasicBlock &MBB = *MBBI;
+
+    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+      // When a uniform loop is inside non-uniform control flow, the branch
+      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
+      // when EXEC = 0. We should skip the loop lest it become infinite.
+      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
+          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
+        return true;
+
+      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
+        return true;
+
+      // These instructions are potentially expensive even if EXEC = 0.
+      if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
+          I->getOpcode() == AMDGPU::S_WAITCNT)
+        return true;
+
+      ++NumInstr;
+      if (NumInstr >= SkipThreshold)
+        return true;
+    }
+  }
+
+  return false;
+}
+
+// Returns true if the skip branch instruction is removed.
+bool SIRemoveShortExecBranches::removeExeczBranch(MachineInstr &MI,
+                                                  MachineBasicBlock &SrcMBB) {
+  MachineBasicBlock *TrueMBB = nullptr;
+  MachineBasicBlock *FalseMBB = nullptr;
+  SmallVector<MachineOperand, 1> Cond;
+
+  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
+    return false;
+
+  // Consider only the forward branches.
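+  // A target block number less than or equal to our own means the execz
+  // branch is a backward edge (a loop), where deleting it would not produce
+  // a simple fallthrough over a short region, so keep it.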
+  if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
+      mustRetainExeczBranch(*FalseMBB, *TrueMBB))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
+  MI.eraseFromParent();
+  SrcMBB.removeSuccessor(TrueMBB);
+
+  return true;
+}
+
+bool SIRemoveShortExecBranches::runOnMachineFunction(MachineFunction &MF) {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  TII = ST.getInstrInfo();
+  MF.RenumberBlocks();
+  bool Changed = false;
+
+  for (MachineBasicBlock &MBB : MF) {
+    MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+    if (MBBI == MBB.end())
+      continue;
+
+    MachineInstr &MI = *MBBI;
+    switch (MI.getOpcode()) {
+    case AMDGPU::S_CBRANCH_EXECZ:
+      Changed = removeExeczBranch(MI, MBB);
+      break;
+    default:
+      break;
+    }
+  }
+
+  return Changed;
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index 40e182067023dc..d787e40707be37 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -10,9 +10,8 @@ define i32 @divergent_if_swap_brtarget_order0(i32 %value) {
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; CHECK-NEXT:    ; implicit-def: $vgpr0
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; CHECK-NEXT:    ; mask branch BB0_2
 ; CHECK-NEXT:    s_cbranch_execz BB0_2
-; CHECK-NEXT: BB0_1: ; %if.true
+; CHECK-NEXT: ; %bb.1: ; %if.true
 ; CHECK-NEXT:    global_load_dword v0, v[0:1], off
 ; CHECK-NEXT: BB0_2: ; %endif
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
@@ -38,12 +37,10 @@ define i32 @divergent_if_swap_brtarget_order1(i32 %value) {
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; CHECK-NEXT:    ; implicit-def: $vgpr0
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; CHECK-NEXT:    ; mask branch BB1_2
-; CHECK-NEXT: BB1_1: ; %endif
-; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
-; CHECK-NEXT: BB1_2: ; %if.true
+; CHECK-NEXT:    s_cbranch_execnz BB1_2
+; CHECK-NEXT: ; %bb.1: ; %if.true
 ; CHECK-NEXT:    global_load_dword v0, v[0:1], off
+; CHECK-NEXT: BB1_2: ; %endif
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 2fec729a3da9c5..84d39acb17febe 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -24,9 +24,8 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT:    ; mask branch BB0_2
 ; GFX7LESS-NEXT:    s_cbranch_execz BB0_2
-; GFX7LESS-NEXT: BB0_1:
+; GFX7LESS-NEXT: ; %bb.1:
 ; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
 ; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v2, s4, 5
@@ -54,9 +53,8 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT:    ; mask branch BB0_2
 ; GFX8-NEXT:    s_cbranch_execz BB0_2
-; GFX8-NEXT: BB0_1:
+; GFX8-NEXT: ; %bb.1:
 ; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
@@ -85,9 +83,8 @@ define amdgpu_kernel void @add_i32_constant(i32
addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: ; mask branch BB0_2 ; GFX9-NEXT: s_cbranch_execz BB0_2 -; GFX9-NEXT: BB0_1: +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo @@ -115,9 +112,8 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: ; mask branch BB0_2 ; GFX1064-NEXT: s_cbranch_execz BB0_2 -; GFX1064-NEXT: BB0_1: +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo ; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 @@ -148,9 +144,8 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: ; mask branch BB0_2 ; GFX1032-NEXT: s_cbranch_execz BB0_2 -; GFX1032-NEXT: BB0_1: +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo ; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 @@ -190,9 +185,8 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: ; mask branch BB1_2 ; GFX7LESS-NEXT: s_cbranch_execz BB1_2 -; GFX7LESS-NEXT: BB1_1: +; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 @@ -224,9 +218,8 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: ; mask branch BB1_2 ; GFX8-NEXT: s_cbranch_execz BB1_2 -; GFX8-NEXT: BB1_1: +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[6:7] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s1, s0, s1 @@ -258,9 +251,8 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: ; mask branch BB1_2 ; GFX9-NEXT: s_cbranch_execz BB1_2 -; GFX9-NEXT: BB1_1: +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s1, s0, s1 @@ -291,9 +283,8 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX1064-NEXT: ; mask branch BB1_2 ; GFX1064-NEXT: s_cbranch_execz BB1_2 -; GFX1064-NEXT: BB1_1: +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -327,9 +318,8 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: ; mask branch BB1_2 ; GFX1032-NEXT: 
s_cbranch_execz BB1_2 -; GFX1032-NEXT: BB1_1: +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -415,9 +405,8 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: ; mask branch BB2_2 ; GFX8-NEXT: s_cbranch_execz BB2_2 -; GFX8-NEXT: BB2_1: +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 @@ -469,9 +458,8 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; mask branch BB2_2 ; GFX9-NEXT: s_cbranch_execz BB2_2 -; GFX9-NEXT: BB2_1: +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -527,9 +515,8 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: ; mask branch BB2_2 ; GFX1064-NEXT: s_cbranch_execz BB2_2 -; GFX1064-NEXT: BB2_1: +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v7, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -580,9 +567,8 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: ; mask branch BB2_2 ; GFX1032-NEXT: s_cbranch_execz BB2_2 -; GFX1032-NEXT: BB2_1: +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v7, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -658,9 +644,8 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: ; mask branch BB3_2 ; GFX8-NEXT: s_cbranch_execz BB3_2 -; GFX8-NEXT: BB3_1: +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 @@ -712,9 +697,8 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; mask branch BB3_2 ; GFX9-NEXT: s_cbranch_execz BB3_2 -; GFX9-NEXT: BB3_1: +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -770,9 +754,8 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: ; mask branch BB3_2 ; GFX1064-NEXT: s_cbranch_execz BB3_2 -; GFX1064-NEXT: BB3_1: +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v7, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -823,9 +806,8 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) { 
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: ; mask branch BB3_2 ; GFX1032-NEXT: s_cbranch_execz BB3_2 -; GFX1032-NEXT: BB3_1: +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v7, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -901,9 +883,8 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: ; mask branch BB4_2 ; GFX8-NEXT: s_cbranch_execz BB4_2 -; GFX8-NEXT: BB4_1: +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 @@ -955,9 +936,8 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; mask branch BB4_2 ; GFX9-NEXT: s_cbranch_execz BB4_2 -; GFX9-NEXT: BB4_1: +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1013,9 +993,8 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: ; mask branch BB4_2 ; GFX1064-NEXT: s_cbranch_execz BB4_2 -; GFX1064-NEXT: BB4_1: +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v7, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1066,9 +1045,8 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: ; mask branch BB4_2 ; GFX1032-NEXT: s_cbranch_execz BB4_2 -; GFX1032-NEXT: BB4_1: +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v7, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1107,9 +1085,8 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: ; mask branch BB5_2 ; GFX7LESS-NEXT: s_cbranch_execz BB5_2 -; GFX7LESS-NEXT: BB5_1: +; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 @@ -1143,9 +1120,8 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: ; mask branch BB5_2 ; GFX8-NEXT: s_cbranch_execz BB5_2 -; GFX8-NEXT: BB5_1: +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 ; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 @@ -1178,9 +1154,8 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: ; mask branch BB5_2 ; GFX9-NEXT: s_cbranch_execz BB5_2 -; GFX9-NEXT: 
BB5_1: +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 ; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 @@ -1212,9 +1187,8 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: ; mask branch BB5_2 ; GFX1064-NEXT: s_cbranch_execz BB5_2 -; GFX1064-NEXT: BB5_1: +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 @@ -1247,9 +1221,8 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: ; mask branch BB5_2 ; GFX1032-NEXT: s_cbranch_execz BB5_2 -; GFX1032-NEXT: BB5_1: +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 @@ -1290,9 +1263,8 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: ; mask branch BB6_2 ; GFX7LESS-NEXT: s_cbranch_execz BB6_2 -; GFX7LESS-NEXT: BB6_1: +; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1335,9 +1307,8 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: ; mask branch BB6_2 ; GFX8-NEXT: s_cbranch_execz BB6_2 -; GFX8-NEXT: BB6_1: +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1380,9 +1351,8 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; mask branch BB6_2 ; GFX9-NEXT: s_cbranch_execz BB6_2 -; GFX9-NEXT: BB6_1: +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1424,9 +1394,8 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: ; mask branch BB6_2 ; GFX1064-NEXT: s_cbranch_execz BB6_2 -; GFX1064-NEXT: BB6_1: +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1467,9 +1436,8 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: ; mask branch BB6_2 ; GFX1032-NEXT: s_cbranch_execz BB6_2 -; GFX1032-NEXT: BB6_1: +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1032-NEXT: 
s_waitcnt lgkmcnt(0) @@ -1608,9 +1576,8 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: ; mask branch BB8_2 ; GFX7LESS-NEXT: s_cbranch_execz BB8_2 -; GFX7LESS-NEXT: BB8_1: +; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX7LESS-NEXT: v_mul_u32_u24_e64 v2, s4, 5 @@ -1639,9 +1606,8 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: ; mask branch BB8_2 ; GFX8-NEXT: s_cbranch_execz BB8_2 -; GFX8-NEXT: BB8_1: +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo @@ -1671,9 +1637,8 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: ; mask branch BB8_2 ; GFX9-NEXT: s_cbranch_execz BB8_2 -; GFX9-NEXT: BB8_1: +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo @@ -1702,9 +1667,8 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: ; mask branch BB8_2 ; GFX1064-NEXT: s_cbranch_execz BB8_2 -; GFX1064-NEXT: BB8_1: +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo ; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 @@ -1736,9 +1700,8 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: ; mask branch BB8_2 ; GFX1032-NEXT: s_cbranch_execz BB8_2 -; GFX1032-NEXT: BB8_1: +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo ; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 @@ -1779,9 +1742,8 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: ; mask branch BB9_2 ; GFX7LESS-NEXT: s_cbranch_execz BB9_2 -; GFX7LESS-NEXT: BB9_1: +; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 @@ -1813,9 +1775,8 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: ; mask branch BB9_2 ; GFX8-NEXT: s_cbranch_execz BB9_2 -; GFX8-NEXT: BB9_1: +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[6:7] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s1, s0, s1 @@ -1847,9 +1808,8 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; 
GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: ; mask branch BB9_2 ; GFX9-NEXT: s_cbranch_execz BB9_2 -; GFX9-NEXT: BB9_1: +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s1, s0, s1 @@ -1880,9 +1840,8 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX1064-NEXT: ; mask branch BB9_2 ; GFX1064-NEXT: s_cbranch_execz BB9_2 -; GFX1064-NEXT: BB9_1: +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1916,9 +1875,8 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: ; mask branch BB9_2 ; GFX1032-NEXT: s_cbranch_execz BB9_2 -; GFX1032-NEXT: BB9_1: +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -2004,9 +1962,8 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: ; mask branch BB10_2 ; GFX8-NEXT: s_cbranch_execz BB10_2 -; GFX8-NEXT: BB10_1: +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 @@ -2058,9 +2015,8 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; mask branch BB10_2 ; GFX9-NEXT: s_cbranch_execz BB10_2 -; GFX9-NEXT: BB10_1: +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2116,9 +2072,8 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: ; mask branch BB10_2 ; GFX1064-NEXT: s_cbranch_execz BB10_2 -; GFX1064-NEXT: BB10_1: +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v7, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2169,9 +2124,8 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: ; mask branch BB10_2 ; GFX1032-NEXT: s_cbranch_execz BB10_2 -; GFX1032-NEXT: BB10_1: +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v7, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2210,9 +2164,8 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: ; mask branch BB11_2 ; GFX7LESS-NEXT: s_cbranch_execz BB11_2 -; GFX7LESS-NEXT: BB11_1: +; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; 
GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 @@ -2246,9 +2199,8 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: ; mask branch BB11_2 ; GFX8-NEXT: s_cbranch_execz BB11_2 -; GFX8-NEXT: BB11_1: +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 ; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 @@ -2282,9 +2234,8 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: ; mask branch BB11_2 ; GFX9-NEXT: s_cbranch_execz BB11_2 -; GFX9-NEXT: BB11_1: +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 ; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 @@ -2317,9 +2268,8 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: ; mask branch BB11_2 ; GFX1064-NEXT: s_cbranch_execz BB11_2 -; GFX1064-NEXT: BB11_1: +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 @@ -2354,9 +2304,8 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: ; mask branch BB11_2 ; GFX1032-NEXT: s_cbranch_execz BB11_2 -; GFX1032-NEXT: BB11_1: +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 @@ -2399,9 +2348,8 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: ; mask branch BB12_2 ; GFX7LESS-NEXT: s_cbranch_execz BB12_2 -; GFX7LESS-NEXT: BB12_1: +; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -2444,9 +2392,8 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: ; mask branch BB12_2 ; GFX8-NEXT: s_cbranch_execz BB12_2 -; GFX8-NEXT: BB12_1: +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2489,9 +2436,8 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; mask branch BB12_2 ; GFX9-NEXT: s_cbranch_execz BB12_2 -; GFX9-NEXT: BB12_1: +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2533,9 +2479,8 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive 
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: ; mask branch BB12_2 ; GFX1064-NEXT: s_cbranch_execz BB12_2 -; GFX1064-NEXT: BB12_1: +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -2576,9 +2521,8 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: ; mask branch BB12_2 ; GFX1032-NEXT: s_cbranch_execz BB12_2 -; GFX1032-NEXT: BB12_1: +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -2757,9 +2701,8 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: ; mask branch BB14_2 ; GFX8-NEXT: s_cbranch_execz BB14_2 -; GFX8-NEXT: BB14_1: +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 @@ -2811,9 +2754,8 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; mask branch BB14_2 ; GFX9-NEXT: s_cbranch_execz BB14_2 -; GFX9-NEXT: BB14_1: +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2869,9 +2811,8 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: ; mask branch BB14_2 ; GFX1064-NEXT: s_cbranch_execz BB14_2 -; GFX1064-NEXT: BB14_1: +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v7, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2922,9 +2863,8 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: ; mask branch BB14_2 ; GFX1032-NEXT: s_cbranch_execz BB14_2 -; GFX1032-NEXT: BB14_1: +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v7, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3003,9 +2943,8 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: ; mask branch BB15_2 ; GFX8-NEXT: s_cbranch_execz BB15_2 -; GFX8-NEXT: BB15_1: +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 @@ -3057,9 +2996,8 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; mask branch BB15_2 ; GFX9-NEXT: s_cbranch_execz BB15_2 -; GFX9-NEXT: BB15_1: +; GFX9-NEXT: ; %bb.1: 
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3115,9 +3053,8 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: ; mask branch BB15_2 ; GFX1064-NEXT: s_cbranch_execz BB15_2 -; GFX1064-NEXT: BB15_1: +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v7, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3168,9 +3105,8 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: ; mask branch BB15_2 ; GFX1032-NEXT: s_cbranch_execz BB15_2 -; GFX1032-NEXT: BB15_1: +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v7, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3249,9 +3185,8 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: ; mask branch BB16_2 ; GFX8-NEXT: s_cbranch_execz BB16_2 -; GFX8-NEXT: BB16_1: +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 @@ -3303,9 +3238,8 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; mask branch BB16_2 ; GFX9-NEXT: s_cbranch_execz BB16_2 -; GFX9-NEXT: BB16_1: +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3361,9 +3295,8 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: ; mask branch BB16_2 ; GFX1064-NEXT: s_cbranch_execz BB16_2 -; GFX1064-NEXT: BB16_1: +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v7, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3414,9 +3347,8 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: ; mask branch BB16_2 ; GFX1032-NEXT: s_cbranch_execz BB16_2 -; GFX1032-NEXT: BB16_1: +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v7, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3495,9 +3427,8 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: ; mask branch BB17_2 ; GFX8-NEXT: s_cbranch_execz BB17_2 -; GFX8-NEXT: BB17_1: +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 @@ -3549,9 +3480,8 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, 
v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; mask branch BB17_2 ; GFX9-NEXT: s_cbranch_execz BB17_2 -; GFX9-NEXT: BB17_1: +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3607,9 +3537,8 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: ; mask branch BB17_2 ; GFX1064-NEXT: s_cbranch_execz BB17_2 -; GFX1064-NEXT: BB17_1: +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v7, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3660,9 +3589,8 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: ; mask branch BB17_2 ; GFX1032-NEXT: s_cbranch_execz BB17_2 -; GFX1032-NEXT: BB17_1: +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v7, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3701,9 +3629,8 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: ; mask branch BB18_2 ; GFX7LESS-NEXT: s_cbranch_execz BB18_2 -; GFX7LESS-NEXT: BB18_1: +; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -3739,9 +3666,8 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: ; mask branch BB18_2 ; GFX8-NEXT: s_cbranch_execz BB18_2 -; GFX8-NEXT: BB18_1: +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -3777,9 +3703,8 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: ; mask branch BB18_2 ; GFX9-NEXT: s_cbranch_execz BB18_2 -; GFX9-NEXT: BB18_1: +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -3814,9 +3739,8 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: ; mask branch BB18_2 ; GFX1064-NEXT: s_cbranch_execz BB18_2 -; GFX1064-NEXT: BB18_1: +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -3851,9 +3775,8 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: ; mask branch BB18_2 ; GFX1032-NEXT: s_cbranch_execz BB18_2 -; GFX1032-NEXT: BB18_1: +; GFX1032-NEXT: ; %bb.1: ; 
GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -3936,9 +3859,8 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: ; mask branch BB19_2 ; GFX8-NEXT: s_cbranch_execz BB19_2 -; GFX8-NEXT: BB19_1: +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 @@ -3990,9 +3912,8 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; mask branch BB19_2 ; GFX9-NEXT: s_cbranch_execz BB19_2 -; GFX9-NEXT: BB19_1: +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4048,9 +3969,8 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: ; mask branch BB19_2 ; GFX1064-NEXT: s_cbranch_execz BB19_2 -; GFX1064-NEXT: BB19_1: +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v7, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4101,9 +4021,8 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: ; mask branch BB19_2 ; GFX1032-NEXT: s_cbranch_execz BB19_2 -; GFX1032-NEXT: BB19_1: +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v7, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4142,9 +4061,8 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: ; mask branch BB20_2 ; GFX7LESS-NEXT: s_cbranch_execz BB20_2 -; GFX7LESS-NEXT: BB20_1: +; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -4180,9 +4098,8 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: ; mask branch BB20_2 ; GFX8-NEXT: s_cbranch_execz BB20_2 -; GFX8-NEXT: BB20_1: +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -4218,9 +4135,8 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: ; mask branch BB20_2 ; GFX9-NEXT: s_cbranch_execz BB20_2 -; GFX9-NEXT: BB20_1: +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -4255,9 +4171,8 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; 
implicit-def: $vgpr0_vgpr1
 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: ; mask branch BB20_2
 ; GFX1064-NEXT: s_cbranch_execz BB20_2
-; GFX1064-NEXT: BB20_1:
+; GFX1064-NEXT: ; %bb.1:
 ; GFX1064-NEXT: v_mov_b32_e32 v0, 5
 ; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0
@@ -4292,9 +4207,8 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
-; GFX1032-NEXT: ; mask branch BB20_2
 ; GFX1032-NEXT: s_cbranch_execz BB20_2
-; GFX1032-NEXT: BB20_1:
+; GFX1032-NEXT: ; %bb.1:
 ; GFX1032-NEXT: v_mov_b32_e32 v0, 5
 ; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0
@@ -4377,9 +4291,8 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT: ; implicit-def: $vgpr0
 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX8-NEXT: ; mask branch BB21_2
 ; GFX8-NEXT: s_cbranch_execz BB21_2
-; GFX8-NEXT: BB21_1:
+; GFX8-NEXT: ; %bb.1:
 ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
 ; GFX8-NEXT: s_mov_b32 m0, -1
@@ -4431,9 +4344,8 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT: ; implicit-def: $vgpr0
 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; mask branch BB21_2
 ; GFX9-NEXT: s_cbranch_execz BB21_2
-; GFX9-NEXT: BB21_1:
+; GFX9-NEXT: ; %bb.1:
 ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4489,9 +4401,8 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT: ; implicit-def: $vgpr0
 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT: ; mask branch BB21_2
 ; GFX1064-NEXT: s_cbranch_execz BB21_2
-; GFX1064-NEXT: BB21_1:
+; GFX1064-NEXT: ; %bb.1:
 ; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
 ; GFX1064-NEXT: v_mov_b32_e32 v7, s3
 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4542,9 +4453,8 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT: ; implicit-def: $vgpr0
 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
-; GFX1032-NEXT: ; mask branch BB21_2
 ; GFX1032-NEXT: s_cbranch_execz BB21_2
-; GFX1032-NEXT: BB21_1:
+; GFX1032-NEXT: ; %bb.1:
 ; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
 ; GFX1032-NEXT: v_mov_b32_e32 v7, s3
 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4583,9 +4493,8 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT: ; mask branch BB22_2
 ; GFX7LESS-NEXT: s_cbranch_execz BB22_2
-; GFX7LESS-NEXT: BB22_1:
+; GFX7LESS-NEXT: ; %bb.1:
 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
@@ -4620,9 +4529,8 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: ; mask branch BB22_2
 ; GFX8-NEXT: s_cbranch_execz BB22_2
-; GFX8-NEXT: BB22_1:
+; GFX8-NEXT: ; %bb.1:
 ; GFX8-NEXT: v_mov_b32_e32 v0, 5
 ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
@@ -4657,9 +4565,8 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: ; mask branch BB22_2
 ; GFX9-NEXT: s_cbranch_execz BB22_2
-; GFX9-NEXT: BB22_1:
+; GFX9-NEXT: ; %bb.1:
 ; GFX9-NEXT: v_mov_b32_e32 v0, 5
 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -4693,9 +4600,8 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: ; mask branch BB22_2
 ; GFX1064-NEXT: s_cbranch_execz BB22_2
-; GFX1064-NEXT: BB22_1:
+; GFX1064-NEXT: ; %bb.1:
 ; GFX1064-NEXT: v_mov_b32_e32 v0, 5
 ; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0
@@ -4730,9 +4636,8 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
-; GFX1032-NEXT: ; mask branch BB22_2
 ; GFX1032-NEXT: s_cbranch_execz BB22_2
-; GFX1032-NEXT: BB22_1:
+; GFX1032-NEXT: ; %bb.1:
 ; GFX1032-NEXT: v_mov_b32_e32 v0, 5
 ; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0
@@ -4815,9 +4720,8 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX8-NEXT: ; implicit-def: $vgpr0
 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX8-NEXT: ; mask branch BB23_2
 ; GFX8-NEXT: s_cbranch_execz BB23_2
-; GFX8-NEXT: BB23_1:
+; GFX8-NEXT: ; %bb.1:
 ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
 ; GFX8-NEXT: s_mov_b32 m0, -1
@@ -4869,9 +4773,8 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-NEXT: ; implicit-def: $vgpr0
 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; mask branch BB23_2
 ; GFX9-NEXT: s_cbranch_execz BB23_2
-; GFX9-NEXT: BB23_1:
+; GFX9-NEXT: ; %bb.1:
 ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4927,9 +4830,8 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
 ; GFX1064-NEXT: ; implicit-def: $vgpr0
 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT: ; mask branch BB23_2
 ; GFX1064-NEXT: s_cbranch_execz BB23_2
-; GFX1064-NEXT: BB23_1:
+; GFX1064-NEXT: ; %bb.1:
 ; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
 ; GFX1064-NEXT: v_mov_b32_e32 v7, s3
 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4980,9 +4882,8 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
 ; GFX1032-NEXT: ; implicit-def: $vgpr0
 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
-; GFX1032-NEXT: ; mask branch BB23_2
 ; GFX1032-NEXT: s_cbranch_execz BB23_2
-; GFX1032-NEXT: BB23_1:
+; GFX1032-NEXT: ; %bb.1:
 ; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
 ; GFX1032-NEXT: v_mov_b32_e32 v7, s3
 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5021,9 +4922,8 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT: ; mask branch BB24_2
 ; GFX7LESS-NEXT: s_cbranch_execz BB24_2
-; GFX7LESS-NEXT: BB24_1:
+; GFX7LESS-NEXT: ; %bb.1:
 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
@@ -5058,9 +4958,8 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: ; mask branch BB24_2
 ; GFX8-NEXT: s_cbranch_execz BB24_2
-; GFX8-NEXT: BB24_1:
+; GFX8-NEXT: ; %bb.1:
 ; GFX8-NEXT: v_mov_b32_e32 v0, 5
 ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
@@ -5095,9 +4994,8 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: ; mask branch BB24_2
 ; GFX9-NEXT: s_cbranch_execz BB24_2
-; GFX9-NEXT: BB24_1:
+; GFX9-NEXT: ; %bb.1:
 ; GFX9-NEXT: v_mov_b32_e32 v0, 5
 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -5131,9 +5029,8 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: ; mask branch BB24_2
 ; GFX1064-NEXT: s_cbranch_execz BB24_2
-; GFX1064-NEXT: BB24_1:
+; GFX1064-NEXT: ; %bb.1:
 ; GFX1064-NEXT: v_mov_b32_e32 v0, 5
 ; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0
@@ -5168,9 +5065,8 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
-; GFX1032-NEXT: ; mask branch BB24_2
 ; GFX1032-NEXT: s_cbranch_execz BB24_2
-; GFX1032-NEXT: BB24_1:
+; GFX1032-NEXT: ; %bb.1:
 ; GFX1032-NEXT: v_mov_b32_e32 v0, 5
 ; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index 81d045b14d9e8f..39c2d997eb8596 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -11,7 +11,7 @@ declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1)
 ; Show that what the atomic optimization pass will do for raw buffers.
 ; GCN-LABEL: add_i32_constant:
-; GCN-LABEL: BB0_1:
+; %bb.{{[0-9]+}}:
 ; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
 ; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
diff --git a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
index d1fcd1547d57d4..4f03978c515968 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
@@ -14,12 +14,11 @@
 ; GCN-DAG: v_cmp_lt_f32_e32 vcc,
 ; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]]
 ; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]]
-; GCN: ; mask branch [[BB5:BB[0-9]+_[0-9]+]]
-; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %bb4
+; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %bb4
 ; GCN: ds_write_b32
-; GCN: [[BB5]]
+; GCN: ; %bb.{{[0-9]+}}:
 ; GCN-NEXT: s_endpgm
 ; GCN-NEXT: .Lfunc_end
 define amdgpu_ps void @ham(float %arg, float %arg1) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index a2facaafb41f9c..1131f5f3c196bf 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -389,7 +389,6 @@ bb3:
 ; GCN-LABEL: {{^}}uniform_inside_divergent:
 ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
 ; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: s_cbranch_execnz [[IF:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %entry
@@ -401,7 +400,7 @@ bb3:
 ; GCN-NEXT: [[IF]]: ; %if
 ; GCN: buffer_store_dword
 ; GCN: s_cmp_lg_u32
-; GCN: s_cbranch_scc1 [[ENDIF]]
+; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: ; %bb.2: ; %if_uniform
 ; GCN: buffer_store_dword
@@ -438,12 +437,10 @@ endif:
 ; GCN: v_cmp_nlt_f32_e32 vcc
 ; GCN-NEXT: s_and_saveexec_b64 [[TEMP_MASK:s\[[0-9]+:[0-9]+\]]], vcc
 ; GCN-NEXT: s_xor_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[TEMP_MASK]]
-; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
-; GCN: [[FLOW]]: ; %Flow
+; GCN: BB{{[0-9]+_[0-9]+}}: ; %Flow
 ; GCN-NEXT: s_or_saveexec_b64 [[TEMP_MASK1:s\[[0-9]+:[0-9]+\]]], [[MASK]]
 ; GCN-NEXT: s_xor_b64 exec, exec, [[TEMP_MASK1]]
-; GCN-NEXT: ; mask branch [[RET:BB[0-9]+_[0-9]+]]
 ; GCN: [[LOOP_BODY:BB[0-9]+_[0-9]+]]: ; %loop{{$}}
 ; GCN: ;;#ASMSTART
@@ -454,7 +451,7 @@ endif:
 ; GCN: v_nop_e64
 ; GCN: v_nop_e64
 ; GCN: ;;#ASMEND
-; GCN: s_cbranch_vccz [[RET]]
+; GCN: s_cbranch_vccz [[RET:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop
 ; GCN-NEXT: ; in Loop: Header=[[LOOP_BODY]] Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/call-skip.ll b/llvm/test/CodeGen/AMDGPU/call-skip.ll
index cd963df6c499c7..d99226ac45b717 100644
--- a/llvm/test/CodeGen/AMDGPU/call-skip.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-skip.ll
@@ -8,8 +8,7 @@ define hidden void @func() #1 {
 ; GCN-LABEL: {{^}}if_call:
 ; GCN: s_and_saveexec_b64
-; GCN-NEXT: ; mask branch [[END:BB[0-9]+_[0-9]+]]
-; GCN-NEXT: s_cbranch_execz [[END]]
+; GCN-NEXT: s_cbranch_execz [[END:BB[0-9]+_[0-9]+]]
 ; GCN: s_swappc_b64
 ; GCN: [[END]]:
 define void @if_call(i32 %flag) #0 {
@@ -26,8 +25,7 @@ end:
 ; GCN-LABEL: {{^}}if_asm:
 ; GCN: s_and_saveexec_b64
-; GCN-NEXT: ; mask branch [[END:BB[0-9]+_[0-9]+]]
-; GCN-NEXT: s_cbranch_execz [[END]]
+; GCN-NEXT: s_cbranch_execz [[END:BB[0-9]+_[0-9]+]]
 ; GCN: ; sample asm
 ; GCN: [[END]]:
 define void @if_asm(i32 %flag) #0 {
@@ -44,8 +42,7 @@ end:
 ; GCN-LABEL: {{^}}if_call_kernel:
 ; GCN: s_and_saveexec_b64
-; GCN-NEXT: ; mask branch [[END:BB[0-9]+_[0-9]+]]
-; GCN-NEXT: s_cbranch_execz [[END]]
+; GCN-NEXT: s_cbranch_execz BB3_2
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @if_call_kernel() #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index 6a8456d99bcebc..bb00e67fca25eb 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -3,12 +3,10 @@
 ; ALL-LABEL: {{^}}simple_nested_if:
 ; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]]
-; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9_]+]]
-; GCN-NEXT: s_cbranch_execz [[ENDIF]]
+; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9_]+]]
 ; GCN: s_and_b64 exec, exec, vcc
-; GCN-NEXT: ; mask branch [[ENDIF]]
 ; GCN-NEXT: s_cbranch_execz [[ENDIF]]
-; GCN-NEXT: {{^BB[0-9_]+}}:
+; GCN-NEXT: ; %bb.{{[0-9]+}}:
 ; GCN: store_dword
 ; GCN-NEXT: {{^}}[[ENDIF]]:
 ; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC]]
@@ -43,12 +41,10 @@ bb.outer.end: ; preds = %bb.outer.then, %bb.
 ; ALL-LABEL: {{^}}uncollapsable_nested_if:
 ; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]]
-; GCN-NEXT: ; mask branch [[ENDIF_OUTER:BB[0-9_]+]]
-; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER]]
+; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:BB[0-9_]+]]
 ; GCN: s_and_saveexec_b64 [[SAVEEXEC_INNER:s\[[0-9:]+\]]]
-; GCN-NEXT: ; mask branch [[ENDIF_INNER:BB[0-9_]+]]
-; GCN-NEXT: s_cbranch_execz [[ENDIF_INNER]]
-; GCN-NEXT: {{^BB[0-9_]+}}:
+; GCN-NEXT: s_cbranch_execz [[ENDIF_INNER:BB[0-9_]+]]
+; GCN-NEXT: ; %bb.{{[0-9]+}}:
 ; GCN: store_dword
 ; GCN-NEXT: {{^}}[[ENDIF_INNER]]:
 ; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER]]
@@ -88,18 +84,16 @@ bb.outer.end: ; preds = %bb.inner.then, %bb
 ; ALL-LABEL: {{^}}nested_if_if_else:
 ; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]]
-; GCN-NEXT: ; mask branch [[ENDIF_OUTER:BB[0-9_]+]]
-; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER]]
+; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:BB[0-9_]+]]
 ; GCN: s_and_saveexec_b64 [[SAVEEXEC_INNER:s\[[0-9:]+\]]]
 ; GCN-NEXT: s_xor_b64 [[SAVEEXEC_INNER2:s\[[0-9:]+\]]], exec, [[SAVEEXEC_INNER]]
-; GCN-NEXT: ; mask branch [[THEN_INNER:BB[0-9_]+]]
-; GCN-NEXT: s_cbranch_execz [[THEN_INNER]]
-; GCN-NEXT: {{^BB[0-9_]+}}:
+; GCN-NEXT: s_cbranch_execz [[THEN_INNER:BB[0-9_]+]]
+; GCN-NEXT: ; %bb.{{[0-9]+}}:
 ; GCN: store_dword
 ; GCN-NEXT: {{^}}[[THEN_INNER]]:
 ; GCN-NEXT: s_or_saveexec_b64 [[SAVEEXEC_INNER3:s\[[0-9:]+\]]], [[SAVEEXEC_INNER2]]
 ; GCN-NEXT: s_xor_b64 exec, exec, [[SAVEEXEC_INNER3]]
-; GCN-NEXT: ; mask branch [[ENDIF_OUTER]]
+; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER]]
 ; GCN: store_dword
 ; GCN-NEXT: {{^}}[[ENDIF_OUTER]]:
 ; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER]]
@@ -137,28 +131,24 @@ bb.outer.end: ; preds = %bb, %bb.then, %b
 ; ALL-LABEL: {{^}}nested_if_else_if:
 ; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]]
 ; GCN-NEXT: s_xor_b64 [[SAVEEXEC_OUTER2:s\[[0-9:]+\]]], exec, [[SAVEEXEC_OUTER]]
-; GCN-NEXT: ; mask branch [[THEN_OUTER:BB[0-9_]+]]
-; GCN-NEXT: s_cbranch_execz [[THEN_OUTER]]
-; GCN-NEXT: {{^BB[0-9_]+}}:
+; GCN-NEXT: s_cbranch_execz [[THEN_OUTER:BB[0-9_]+]]
+; GCN-NEXT: ; %bb.{{[0-9]+}}:
 ; GCN: store_dword
 ; GCN-NEXT: s_and_saveexec_b64 [[SAVEEXEC_INNER_IF_OUTER_ELSE:s\[[0-9:]+\]]]
-; GCN-NEXT: ; mask branch [[THEN_OUTER_FLOW:BB[0-9_]+]]
-; GCN-NEXT: s_cbranch_execz [[THEN_OUTER_FLOW]]
-; GCN-NEXT: {{^BB[0-9_]+}}:
+; GCN-NEXT: s_cbranch_execz [[THEN_OUTER_FLOW:BB[0-9_]+]]
+; GCN-NEXT: ; %bb.{{[0-9]+}}:
 ; GCN: store_dword
 ; GCN-NEXT: {{^}}[[THEN_OUTER_FLOW]]:
 ; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER_IF_OUTER_ELSE]]
 ; GCN-NEXT: {{^}}[[THEN_OUTER]]:
 ; GCN-NEXT: s_or_saveexec_b64 [[SAVEEXEC_OUTER3:s\[[0-9:]+\]]], [[SAVEEXEC_OUTER2]]
 ; GCN-NEXT: s_xor_b64 exec, exec, [[SAVEEXEC_OUTER3]]
-; GCN-NEXT: ; mask branch [[ENDIF_OUTER:BB[0-9_]+]]
-; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER]]
-; GCN-NEXT: {{^BB[0-9_]+}}:
+; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:BB[0-9_]+]]
+; GCN-NEXT: ; %bb.{{[0-9]+}}:
 ; GCN: store_dword
 ; GCN-NEXT: s_and_saveexec_b64 [[SAVEEXEC_INNER_IF_OUTER_THEN:s\[[0-9:]+\]]]
-; GCN-NEXT: ; mask branch [[FLOW1:BB[0-9_]+]]
-; GCN-NEXT: s_cbranch_execz [[FLOW1]]
-; GCN-NEXT: {{^BB[0-9_]+}}:
+; GCN-NEXT: s_cbranch_execz [[FLOW1:BB[0-9_]+]]
+; GCN-NEXT: ; %bb.{{[0-9]+}}:
 ; GCN: store_dword
 ; GCN-NEXT: [[FLOW1]]:
 ; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER_IF_OUTER_THEN]]
@@ -203,9 +193,8 @@ bb.outer.end:
 ; ALL-LABEL: {{^}}s_endpgm_unsafe_barrier:
 ; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]]
-; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9_]+]]
-; GCN-NEXT: s_cbranch_execz [[ENDIF]]
-; GCN-NEXT: {{^BB[0-9_]+}}:
+; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9_]+]]
+; GCN-NEXT: ; %bb.{{[0-9]+}}:
 ; GCN: store_dword
 ; GCN-NEXT: {{^}}[[ENDIF]]:
 ; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC]]
diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index 15e807a3e02305..f144ed263ff392 100644
--- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -35,9 +35,9 @@
 ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
-; GCN: mask branch [[ENDIF:BB[0-9]+_[0-9]+]]
+; GCN: s_cbranch_execz [[ENDIF:BB[0-9]+_[0-9]+]]
-; GCN: {{^}}BB{{[0-9]+}}_1: ; %if
+; GCN: ; %bb.{{[0-9]+}}: ; %if
 ; GCN: s_mov_b32 m0, -1
 ; GCN: ds_read_b32 [[LOAD1:v[0-9]+]]
 ; GCN: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
@@ -116,8 +116,7 @@ endif:
 ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
-; GCN-NEXT: ; mask branch [[END:BB[0-9]+_[0-9]+]]
-; GCN-NEXT: s_cbranch_execz [[END]]
+; GCN-NEXT: s_cbranch_execz [[END:BB[0-9]+_[0-9]+]]
 ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]:
@@ -194,8 +193,7 @@ end:
 ; GCN: s_mov_b64 exec, [[CMP0]]
 ; FIXME: It makes no sense to put this skip here
-; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
-; GCN: s_cbranch_execz [[FLOW]]
+; GCN: s_cbranch_execz [[FLOW:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: s_branch [[ELSE:BB[0-9]+_[0-9]+]]
 ; GCN: [[FLOW]]: ; %Flow
@@ -229,11 +227,10 @@ end:
 ; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], s7 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 ; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}
-; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9]+_[0-9]+]]
-; GCN-NEXT: s_cbranch_execz [[ENDIF]]
+; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9]+_[0-9]+]]
-; GCN: BB{{[0-9]+}}_2: ; %if
+; GCN: ; %bb.{{[0-9]+}}: ; %if
 ; GCN: ds_read_b32
 ; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
diff --git a/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll b/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll
index 80907bf1c1beb9..2ba4d0cf1d992c 100644
--- a/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll
+++ b/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll
@@ -4,8 +4,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 ; GCN-LABEL: {{^}}convergent_inlineasm:
 ; GCN: %bb.0:
 ; GCN: v_cmp_ne_u32_e64
-; GCN: ; mask branch
-; GCN: BB{{[0-9]+_[0-9]+}}:
+; GCN: s_cbranch_execz
+; GCN: ; %bb.{{[0-9]+}}:
 define amdgpu_kernel void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
 bb:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -23,9 +23,9 @@ bb5: ; preds = %bb3, %bb
 }
 ; GCN-LABEL: {{^}}nonconvergent_inlineasm:
-; GCN: ; mask branch
+; GCN: s_cbranch_execz
-; GCN: BB{{[0-9]+_[0-9]+}}:
+; GCN: ; %bb.{{[0-9]+}}:
 ; GCN: v_cmp_ne_u32_e64
 ; GCN: BB{{[0-9]+_[0-9]+}}:
diff --git a/llvm/test/CodeGen/AMDGPU/cse-phi-incoming-val.ll b/llvm/test/CodeGen/AMDGPU/cse-phi-incoming-val.ll
index 6b3491b0c75dc9..5209b2bf7f3c79 100644
--- a/llvm/test/CodeGen/AMDGPU/cse-phi-incoming-val.ll
+++ b/llvm/test/CodeGen/AMDGPU/cse-phi-incoming-val.ll
@@ -8,7 +8,7 @@
 ; CHECK: s_mov_b32 [[SREG:s[0-9]+]], 1.0
 ; CHECK: %bb.1:
 ; CHECK-NOT: v_mov_b32_e32 {{v[0-9]+}}, 1.0
-; CHECK: BB0_4:
+; CHECK: BB0_3:
 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, [[SREG]]
 define amdgpu_ps void @mov_opt(i32 %arg, i32 inreg %arg1, i32 inreg %arg2) local_unnamed_addr #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
index 895539c00bce97..563b5dcead5c48 100644
--- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
@@ -32,7 +32,6 @@ define amdgpu_ps void @main(i32, float) {
 ; CHECK-NEXT: s_and_b64 s[8:9], s[8:9], exec
 ; CHECK-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
 ; CHECK-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; CHECK-NEXT: s_cbranch_execz BB0_6
 ; CHECK-NEXT: BB0_3: ; %loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], exec
@@ -44,21 +43,19 @@ define amdgpu_ps void @main(i32, float) {
 ; CHECK-NEXT: s_mov_b64 s[6:7], -1
 ; CHECK-NEXT: s_and_saveexec_b64 s[8:9], vcc
 ; CHECK-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; CHECK-NEXT: ; mask branch BB0_1
 ; CHECK-NEXT: s_cbranch_execz BB0_1
-; CHECK-NEXT: BB0_5: ; %endif2
+; CHECK-NEXT: ; %bb.5: ; %endif2
 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT: s_add_i32 s0, s0, 1
 ; CHECK-NEXT: s_xor_b64 s[6:7], exec, -1
 ; CHECK-NEXT: s_branch BB0_1
-; CHECK-NEXT: BB0_6: ; %Flow2
+; CHECK-NEXT: ; %bb.6: ; %Flow2
 ; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
 ; CHECK-NEXT: v_mov_b32_e32 v1, 0
 ; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[4:5]
-; CHECK-NEXT: ; mask branch BB0_8
-; CHECK-NEXT: BB0_7: ; %if1
+; CHECK-NEXT: ; %bb.7: ; %if1
 ; CHECK-NEXT: v_sqrt_f32_e32 v1, v0
-; CHECK-NEXT: BB0_8: ; %endloop
+; CHECK-NEXT: ; %bb.8: ; %endloop
 ; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
 ; CHECK-NEXT: exp mrt0 v1, v1, v1, v1 done vm
 ; CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/else.ll b/llvm/test/CodeGen/AMDGPU/else.ll
index 38c9379fe2df71..e641f4263bb137 100644
--- a/llvm/test/CodeGen/AMDGPU/else.ll
+++ b/llvm/test/CodeGen/AMDGPU/else.ll
@@ -5,7 +5,6 @@
 ; CHECK: ; %Flow
 ; CHECK-NEXT: s_or_saveexec_b64 [[DST:s\[[0-9]+:[0-9]+\]]],
 ; CHECK-NEXT: s_xor_b64 exec, exec, [[DST]]
-; CHECK-NEXT: ; mask branch
 define amdgpu_ps float @else_no_execfix(i32 %z, float %v) #0 {
 main_body:
   %cc = icmp sgt i32 %z, 5
@@ -32,7 +31,7 @@ end:
 ; CHECK-NEXT: s_and_b64 exec, exec, [[INIT_EXEC]]
 ; CHECK-NEXT: s_and_b64 [[AND_INIT:s\[[0-9]+:[0-9]+\]]], exec, [[DST]]
 ; CHECK-NEXT: s_xor_b64 exec, exec, [[AND_INIT]]
-; CHECK-NEXT: ; mask branch
+; CHECK-NEXT: s_cbranch_execz
 define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) #0 {
 main_body:
   %cc = icmp sgt i32 %z, 5
diff --git a/llvm/test/CodeGen/AMDGPU/hoist-cond.ll b/llvm/test/CodeGen/AMDGPU/hoist-cond.ll
index 76a26882987baf..08936730fc398a 100644
--- a/llvm/test/CodeGen/AMDGPU/hoist-cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/hoist-cond.ll
@@ -9,7 +9,7 @@
 ; CHECK-NOT: v_cmp
 ; CHECK_NOT: v_cndmask
 ; CHECK: s_and_saveexec_b64 s[{{[[0-9]+:[0-9]+}}], [[COND]]
-; CHECK: BB0_2:
+; CHECK: ; %bb.2:
 define amdgpu_kernel void @hoist_cond(float addrspace(1)* nocapture %arg, float addrspace(1)* noalias nocapture readonly %arg1, i32 %arg3, i32 %arg4) {
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir
index b305cfddb5a5d2..76bb74d3a63f09 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=polaris10 -run-pass si-insert-skips -amdgpu-skip-threshold=1 -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=polaris10 -run-pass si-insert-skips -amdgpu-skip-threshold-legacy=1 -verify-machineinstrs %s -o - | FileCheck %s
 ---
diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir
index c84372086fd3c1..11051198f235e4 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-insert-skips -amdgpu-skip-threshold=1 -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-insert-skips -amdgpu-skip-threshold-legacy=1 -verify-machineinstrs %s -o - | FileCheck %s
 # Make sure mandatory skips are inserted to ensure GWS ops aren't run with exec = 0
 ---
diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir
index 7da59df5d80cc6..a0c0a6f2052266 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass si-insert-skips -amdgpu-skip-threshold=2 %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass si-insert-skips -amdgpu-skip-threshold-legacy=2 %s -o - | FileCheck %s
 ---
diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
index bcc6e12d4ee781..c8832caf616366 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=amdgcn -mcpu=polaris10 -run-pass si-insert-skips -amdgpu-skip-threshold=1 %s -o - | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=polaris10 -run-pass si-insert-skips -amdgpu-skip-threshold-legacy=1 %s -o - | FileCheck %s
 # https://bugs.freedesktop.org/show_bug.cgi?id=99019
 --- |
   define amdgpu_ps void @kill_uncond_branch() {
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
index 7864ca31112211..3b5d4d5628900d 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -158,7 +158,7 @@ entry:
 ; W64: s_mov_b64 exec, [[SAVEEXEC]]
 ; W64: s_cbranch_execz [[TERMBB:BB[0-9]+_[0-9]+]]
-; W64: BB{{[0-9]+_[0-9]+}}:
+; W64: ; %bb.{{[0-9]+}}:
 ; W64-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s4
 ; W64-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
@@ -204,7 +204,7 @@ entry:
 ; W32: s_mov_b32 exec_lo, [[SAVEEXEC]]
 ; W32: s_cbranch_execz [[TERMBB:BB[0-9]+_[0-9]+]]
-; W32: BB{{[0-9]+_[0-9]+}}:
+; W32: ; %bb.{{[0-9]+}}:
 ; W32-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s4
 ; W32-DAG: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo
@@ -270,7 +270,7 @@ entry:
 ; W64-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF:[0-9]+]] ; 4-byte Folded Spill
 ; W64-O0: s_cbranch_execz [[TERMBB:BB[0-9]+_[0-9]+]]
-; W64-O0: BB{{[0-9]+_[0-9]+}}:
+; W64-O0: ; %bb.{{[0-9]+}}:
 ; W64-O0-DAG: s_mov_b64 s{{\[}}[[SAVEEXEC0:[0-9]+]]:[[SAVEEXEC1:[0-9]+]]{{\]}}, exec
 ; W64-O0-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill
 ; W64-O0: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC0]], [[SAVEEXEC_IDX0:[0-9]+]]
diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index af34b0f3987280..d6b717411de791 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -58,9 +58,8 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1
 ; GFX9-NEXT: s_and_saveexec_b64 s[10:11], s[4:5]
-; GFX9-NEXT: ; mask branch BB1_4
 ; GFX9-NEXT: s_cbranch_execz BB1_4
-; GFX9-NEXT: BB1_1: ; %bb19
+; GFX9-NEXT: ; %bb.1: ; %bb19
 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v6
 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v6
 ; GFX9-NEXT: v_add_u32_e32 v6, v4, v0
diff --git a/llvm/test/CodeGen/AMDGPU/ret_jump.ll b/llvm/test/CodeGen/AMDGPU/ret_jump.ll
index ffa851919f7091..84749170dc6d2f 100644
--- a/llvm/test/CodeGen/AMDGPU/ret_jump.ll
+++ b/llvm/test/CodeGen/AMDGPU/ret_jump.ll
@@ -11,12 +11,11 @@
 ; GCN-NEXT: ; %else
 ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
-; GCN: BB{{[0-9]+_[0-9]+}}: ; %unreachable.bb
+; GCN: ; %bb.{{[0-9]+}}: ; %unreachable.bb
 ; GCN-NEXT: ; divergent unreachable
-; GCN-NEXT: {{^}}[[FLOW]]: ; %Flow
+; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %Flow
 ; GCN-NEXT: s_or_b64 exec, exec
 ; GCN-NEXT: [[RET_BB]]:
@@ -55,11 +54,17 @@ ret.bb: ; preds = %else, %main_body
 }
 ; GCN-LABEL: {{^}}uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable:
-; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]]
+; GCN: s_cbranch_vccz
-; GCN: ; %bb.{{[0-9]+}}: ; %else
+; GCN: ; %bb.{{[0-9]+}}: ; %Flow
+; GCN: s_cbranch_execnz [[RETURN:BB[0-9]+_[0-9]+]]
+
+; GCN: ; %UnifiedReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: s_waitcnt
+
+; GCN: BB{{[0-9]+_[0-9]+}}: ; %else
 ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN-NEXT: ; mask branch [[FLOW1:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: ; %unreachable.bb
 ; GCN: ds_write_b32
@@ -67,12 +72,6 @@ ret.bb: ; preds = %else, %main_body
 ; GCN: ; %ret.bb
 ; GCN: store_dword
-
-; GCN: ; %UnifiedReturnBlock
-; GCN-NEXT: s_or_b64 exec, exec
-; GCN-NEXT: s_waitcnt
-; GCN-NEXT: ; return
-; GCN-NEXT: .Lfunc_end
 define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable([9 x <4 x i32>] addrspace(4)* inreg %arg, [17 x <4 x i32>] addrspace(4)* inreg %arg1, [17 x <8 x i32>] addrspace(4)* inreg %arg2, i32 addrspace(4)* inreg %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
 main_body:
   %i.i = extractelement <2 x i32> %arg7, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
index a99d18147ccf1f..e854c089268ff4 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
@@ -40,8 +40,6 @@ bb5: ; preds = %bb3, %bb1
 ; GCN: load_dwordx4
 ; GCN: v_cmp_nlt_f32
 ; GCN: s_and_saveexec_b64
-; GCN: ; mask branch [[UNIFIED_RET:BB[0-9]+_[0-9]+]]
-; GCN-NEXT: [[UNIFIED_RET]]:
 ; GCN-NEXT: s_endpgm
 ; GCN: .Lfunc_end
 define amdgpu_kernel void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
index ce85a66634044c..64f1824b890640 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
@@ -3,13 +3,12 @@
 ; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator:
 ; GCN: v_cmp_eq_u32
 ; GCN: s_and_saveexec_b64
-; GCN: ; mask branch [[RET:BB[0-9]+_[0-9]+]]
-; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %unreachable
+; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %unreachable
 ; GCN: ds_write_b32
 ; GCN: ; divergent unreachable
-; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock
+; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %UnifiedReturnBlock
 ; GCN: s_endpgm
 define amdgpu_kernel void @lower_control_flow_unreachable_terminator() #0 {
@@ -29,13 +28,12 @@ ret:
 ; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order:
 ; GCN: v_cmp_ne_u32
 ; GCN: s_and_saveexec_b64
-; GCN: ; mask branch [[RETURN:BB[0-9]+_[0-9]+]]
-; GCN-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %unreachable
+; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %unreachable
 ; GCN: ds_write_b32
 ; GCN: ; divergent unreachable
-; GCN: [[RETURN]]:
+; GCN: ; %bb.{{[0-9]+}}:
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @lower_control_flow_unreachable_terminator_swap_block_order() #0 {
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir
index fdb0c465c20be9..b360f3aa5ffb64 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir
@@ -32,7 +32,7 @@ body: |
 ; GCN: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc
 ; GCN: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_AND_B64_]], [[COPY]], implicit-def dead $scc
 ; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_]]
- ; GCN: SI_MASK_BRANCH %bb.2, implicit $exec
+ ; GCN: S_CBRANCH_EXECZ %bb.2, implicit $exec
 ; GCN: S_BRANCH %bb.1
 ; GCN: bb.1:
 ; GCN: successors: %bb.2(0x80000000)
diff --git a/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir b/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir
index 554094cf9c0204..9574edd0af98b8 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir
+++ b/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-insert-skips -amdgpu-skip-threshold=1000000 -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-insert-skips -amdgpu-skip-threshold-legacy=1000000 -o - %s | FileCheck %s
 ---
 name: skip_branch_taildup_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll b/llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll
index bce4023316fad0..7b146e0317837d 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll
@@ -5,9 +5,8 @@
 ; An s_cbranch_execnz is required to avoid trapping if all lanes are 0
 ; GCN-LABEL: {{^}}trap_divergent_branch:
 ; GCN: s_and_saveexec_b64
-; GCN: s_cbranch_execz [[ENDPGM:BB[0-9]+_[0-9]+]]
-; GCN: s_branch [[TRAP:BB[0-9]+_[0-9]+]]
-; GCN: [[ENDPGM]]:
+; GCN: s_cbranch_execnz [[TRAP:BB[0-9]+_[0-9]+]]
+; GCN: ; %bb.{{[0-9]+}}:
 ; GCN-NEXT: s_endpgm
 ; GCN: [[TRAP]]:
 ; GCN: s_trap 2
@@ -30,7 +29,7 @@ end:
 ; GCN-LABEL: {{^}}debugtrap_divergent_branch:
 ; GCN: s_and_saveexec_b64
 ; GCN: s_cbranch_execz [[ENDPGM:BB[0-9]+_[0-9]+]]
-; GCN: BB{{[0-9]+}}_{{[0-9]+}}:
+; GCN: ; %bb.{{[0-9]+}}:
 ; GCN: s_trap 3
 ; GCN-NEXT: [[ENDPGM]]:
 ; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index e848e75fc00596..115782863efca7 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -220,10 +220,9 @@ exit:
 ; CHECK: v_cmp_eq_u32_e32 vcc, 0, v0
 ; CHECK-NEXT: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], vcc
 ; CHECK-NEXT: s_xor_b64 [[SAVEEXEC]], exec, [[SAVEEXEC]]
-; CHECK-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
-; CHECK-NEXT: s_cbranch_execz [[EXIT]]
+; CHECK-NEXT: s_cbranch_execz [[EXIT:BB[0-9]+_[0-9]+]]
-; CHECK: {{BB[0-9]+_[0-9]+}}: ; %bb.preheader
+; CHECK: ; %bb.{{[0-9]+}}: ; %bb.preheader
 ; CHECK: s_mov_b32
 ; CHECK: [[LOOP_BB:BB[0-9]+_[0-9]+]]:
@@ -357,20 +356,18 @@ bb7: ; preds = %bb4
 ; CHECK: ; %bb.0:
 ; CHECK: s_and_saveexec_b64
 ; CHECK: s_xor_b64
-; CHECK-NEXT: mask branch [[BB4:BB[0-9]+_[0-9]+]]
 ; CHECK: v_cmpx_gt_f32_e32 vcc, 0,
-; CHECK: [[BB4]]:
+; CHECK: BB{{[0-9]+_[0-9]+}}:
 ; CHECK: s_or_b64 exec, exec
 ; CHECK: image_sample_c
 ; CHECK: v_cmp_neq_f32_e32 vcc, 0,
 ; CHECK: s_and_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
-; CHECK: mask branch [[END:BB[0-9]+_[0-9]+]]
-; CHECK-NEXT: s_cbranch_execz [[END]]
+; CHECK-NEXT: s_cbranch_execz [[END:BB[0-9]+_[0-9]+]]
 ; CHECK-NOT: branch
-; CHECK: BB{{[0-9]+_[0-9]+}}: ; %bb8
+; CHECK: ; %bb.{{[0-9]+}}: ; %bb8
 ; CHECK: buffer_store_dword
 ; CHECK: [[END]]:
diff --git a/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
index 4ba16b4eb30bea..c376886a3e80f7 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
-; GCN-LABEL: BB0_1
+; GCN-LABEL: ; %bb.0:
 ; GCN: s_load_dword s{{[0-9]+}}, s{{\[}}[[ADDR_LO:[0-9]+]]{{\:}}[[ADDR_HI:[0-9]+]]{{\]}}, 0x0
 ; GCN: s_waitcnt lgkmcnt(0)
 ; GCN: global_store_dword v{{\[}}[[ADDR_LO]]{{\:}}[[ADDR_HI]]{{\]}}, v{{[0-9]+}}, off
diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
index 00ae166a6ce51a..be60a34b420891 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
@@ -28,9 +28,8 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GCN-NEXT: ; mask branch BB0_2
 ; GCN-NEXT: s_cbranch_execz BB0_2
-; GCN-NEXT: BB0_1: ; %if.then4.i
+; GCN-NEXT: ; %bb.1: ; %if.then4.i
 ; GCN-NEXT: buffer_load_dword v0, v32, s[36:39], s32 offen
 ; GCN-NEXT: buffer_load_dword v1, v32, s[36:39], s32 offen offset:4
 ; GCN-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
index 3127201e9224ea..70b7d06e442308 100644
--- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
@@ -18,14 +18,13 @@ define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out
 ; CHECK-NEXT: v_mov_b32_e32 v2, s6
 ; CHECK-NEXT: v_mov_b32_e32 v3, s7
 ; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc
-; CHECK-NEXT: ; mask branch BB0_2
-; CHECK-NEXT: BB0_1: ; %ift
+; CHECK-NEXT: ; %bb.1: ; %ift
 ; CHECK-NEXT: s_mov_b32 s4, s5
 ; CHECK-NEXT: v_mov_b32_e32 v0, s4
 ; CHECK-NEXT: v_mov_b32_e32 v1, s5
 ; CHECK-NEXT: v_mov_b32_e32 v2, s6
 ; CHECK-NEXT: v_mov_b32_e32 v3, s7
-; CHECK-NEXT: BB0_2: ; %ife
+; CHECK-NEXT: ; %bb.2: ; %ife
 ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7]
 ; CHECK-NEXT: s_mov_b32 s3, 0xf000
 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
index 9655263406553d..2c64b1bdb3d203 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -335,7 +335,7 @@ endif:
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
 ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN: ; mask branch [[ENDIF_LABEL]]
+; GCN: s_cbranch_execz [[ENDIF_LABEL]]
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
 ; GCN: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
index a1cf6cf630048e..a23eb2b137db19 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
@@ -5,7 +5,6 @@
 ; CHECK-LABEL: {{^}}test1:
 ; CHECK: v_cmp_ne_u32_e32 vcc, 0
 ; CHECK: s_and_saveexec_b64
-; CHECK-NEXT: ; mask branch
 ; CHECK-NEXT: s_cbranch_execz BB{{[0-9]+_[0-9]+}}
 ; CHECK: [[LOOP_BODY_LABEL:BB[0-9]+_[0-9]+]]: ; %loop_body
@@ -33,7 +32,6 @@ out:
 ; CHECK-LABEL: {{^}}test2:
 ; CHECK: s_and_saveexec_b64
-; CHECK-NEXT: ; mask branch
 ; CHECK-NEXT: s_cbranch_execz
 define amdgpu_kernel void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
index ea74268dbe7c2a..481929a0438f77 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
@@ -13,19 +13,16 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 ; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
 ; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
-; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
-; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
-; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
+; SI-NEXT: ; %bb.{{[0-9]+}}: ; %LeafBlock3
 ; SI: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
 ; SI: s_and_saveexec_b64
-; SI-NEXT: ; mask branch
+; SI-NEXT: s_cbranch_execnz
 ; v_mov should be after exec modification
-; SI: [[FLOW_BB]]:
+; SI: ; %bb.{{[0-9]+}}:
 ; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]]
 ; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
-; SI-NEXT: ; mask branch
 ;
 define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
 entry:
@@ -65,10 +62,9 @@ end:
 ; SI-LABEL: {{^}}simple_test_v_if:
 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
-; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
-; SI-NEXT: s_cbranch_execz [[EXIT]]
+; SI-NEXT: s_cbranch_execz [[EXIT:BB[0-9]+_[0-9]+]]
-; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
+; SI-NEXT: ; %bb.{{[0-9]+}}:
 ; SI: buffer_store_dword
 ; SI-NEXT: {{^}}[[EXIT]]:
@@ -92,10 +88,9 @@ exit:
 ; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret:
 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
-; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
-; SI-NEXT: s_cbranch_execz [[EXIT]]
+; SI-NEXT: s_cbranch_execz [[EXIT:BB[0-9]+_[0-9]+]]
-; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
+; SI-NEXT: ; %bb.{{[0-9]+}}:
 ; SI: buffer_store_dword
 ; SI-NEXT: {{^}}[[EXIT]]:
@@ -122,23 +117,22 @@ exit:
 ; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
-; SI: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
+; SI: s_cbranch_execnz [[EXIT:BB[0-9]+_[0-9]+]]
-; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %exit
-; SI: ds_write_b32
-
-; SI-NEXT: {{^}}[[FLOW]]:
+; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %Flow
 ; SI-NEXT: s_or_saveexec_b64
 ; SI-NEXT: s_xor_b64 exec, exec
-; SI-NEXT: ; mask branch [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]]
-; SI-NEXT: s_cbranch_execz [[UNIFIED_RETURN]]
+; SI-NEXT: s_cbranch_execz [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]]
-; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %then
+; SI-NEXT: ; %bb.{{[0-9]+}}: ; %then
 ; SI: s_waitcnt
 ; SI-NEXT: buffer_store_dword
 ; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock
 ; SI: s_endpgm
+
+; SI-NEXT: {{^}}[[EXIT]]:
+; SI: ds_write_b32
 define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %is.0 = icmp ne i32 %tid, 0
@@ -157,7 +151,6 @@ exit:
 ; SI-LABEL: {{^}}simple_test_v_loop:
 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
-; SI-NEXT: ; mask branch
 ; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
 ; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
@@ -199,11 +192,10 @@ exit:
 ; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
 ; SI: v_cmp_lt_i32_e32 vcc
 ; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
-; SI-NEXT: ; mask branch
 ; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
 ; Initialize inner condition to false
-; SI: BB{{[0-9]+_[0-9]+}}: ; %bb10.preheader
+; SI: ; %bb.{{[0-9]+}}: ; %bb10.preheader
 ; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 ; Clear exec bits for workitems that load -1s
@@ -214,9 +206,9 @@ exit:
 ; SI-DAG: v_cmp_ne_u32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]]
 ; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
 ; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]]
-; SI: ; mask branch [[LABEL_FLOW:BB[0-9]+_[0-9]+]]
+; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]]
-; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20
+; SI: ; %bb.{{[0-9]+}}: ; %bb20
 ; SI: buffer_store_dword
 ; SI: [[LABEL_FLOW]]:
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index ce73f9c89dbfea..b05a8acf90161f 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -151,7 +151,7 @@ define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(i32 addrspace(1)* %arg) {
 ; GCN-LABEL: {{^}}test_mask_if:
 ; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, vcc_lo
 ; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}}
-; GCN: ; mask branch
+; GCN: s_cbranch_execz
 define amdgpu_kernel void @test_mask_if(i32 addrspace(1)* %arg) #0 {
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %cmp = icmp ugt i32 %lid, 10
@@ -175,19 +175,18 @@ endif:
 ; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, vcc_lo
 ; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}}
 ; GCN: s_cbranch_execz
-; GCN: BB{{.*}}:
+; GCN: ; %bb.{{[0-9]+}}:
 ; GCN: BB{{.*}}:
 ; GFX1032: s_xor_b32 s{{[0-9]+}}, exec_lo, s{{[0-9]+}}
 ; GFX1064: s_xor_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}]
-; GCN: ; mask branch BB
-; GCN: BB{{.*}}:
-; GCN: BB{{.*}}:
+; GCN: ; %bb.{{[0-9]+}}:
+; GCN: ; %bb.{{[0-9]+}}:
 ; GFX1032: s_or_b32 exec_lo, exec_lo, s{{[0-9]+}}
 ; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, s{{[0-9]+}}
 ; GFX1064: s_or_b64 exec, exec, s[{{[0-9:]+}}]
 ; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
-; GCN: ; mask branch BB
-; GCN: BB{{.*}}:
+; GCN: s_cbranch_execz BB
+; GCN: ; %bb.{{[0-9]+}}:
 ; GCN: BB{{.*}}:
 ; GCN: s_endpgm
 define amdgpu_kernel void @test_loop_with_if(i32 addrspace(1)* %arg) #0 {
@@ -228,9 +227,8 @@ bb13:
 ; GCN-LABEL: {{^}}test_loop_with_if_else_break:
 ; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, vcc_lo
 ; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}}
-; GCN: ; mask branch
 ; GCN: s_cbranch_execz
-; GCN: BB{{.*}}:
+; GCN: ; %bb.{{[0-9]+}}: ; %.preheader
 ; GCN: BB{{.*}}:
 ; GFX1032: s_or_b32 [[MASK0:s[0-9]+]], [[MASK0]], vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index b799c2b5993d5d..f3a44a78e89638 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -425,9 +425,8 @@ END:
 ;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
 ;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
 ;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
-;CHECK-NEXT: mask branch [[END_BB:BB[0-9]+_[0-9]+]]
-;CHECK-NEXT: s_cbranch_execz [[END_BB]]
-;CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %ELSE
+;CHECK-NEXT: s_cbranch_execz [[END_BB:BB[0-9]+_[0-9]+]]
+;CHECK-NEXT: ; %bb.{{[0-9]+}}: ; %ELSE
 ;CHECK: store_dword
 ;CHECK: [[END_BB]]: ; %END
 ;CHECK: s_or_b64 exec, exec,