diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 1ebfa297f4fc3..a581acd7ea73a 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -198,9 +198,8 @@ FunctionPass *llvm::createSIFoldOperandsPass() { bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const { MachineInstr *MI = Fold.UseMI; - MachineOperand &Old = MI->getOperand(Fold.UseOpNo); - assert(Old.isReg()); - + MachineOperand *Old = &MI->getOperand(Fold.UseOpNo); + assert(Old->isReg()); const uint64_t TSFlags = MI->getDesc().TSFlags; if (Fold.isImm()) { @@ -211,7 +210,7 @@ bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const { // Set op_sel/op_sel_hi on this operand or bail out if op_sel is // already set. unsigned Opcode = MI->getOpcode(); - int OpNo = MI->getOperandNo(&Old); + int OpNo = MI->getOperandNo(Old); int ModIdx = -1; if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) ModIdx = AMDGPU::OpName::src0_modifiers; @@ -236,11 +235,11 @@ bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const { if (!(Fold.ImmToFold & 0xffff)) { Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0); Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); - Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff); + Old->ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff); return true; } Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); - Old.ChangeToImmediate(Fold.ImmToFold & 0xffff); + Old->ChangeToImmediate(Fold.ImmToFold & 0xffff); return true; } break; @@ -251,7 +250,9 @@ bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const { } } - if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) { + if (Fold.needsShrink()) { + assert((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && "not handled"); + MachineBasicBlock *MBB = MI->getParent(); auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16); if (Liveness != MachineBasicBlock::LQR_Dead) { @@ -290,37 +291,40 @@ bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const { if (Fold.Commuted) TII->commuteInstruction(*Inst32, false); - return true; - } - assert(!Fold.needsShrink() && "not handled"); + Fold.UseMI = Inst32; + Fold.UseOpNo = AMDGPU::getNamedOperandIdx(Fold.UseMI->getOpcode(), + AMDGPU::OpName::src0); + MI = Fold.UseMI; + Old = &MI->getOperand(Fold.UseOpNo); + } if (Fold.isImm()) { - if (Old.isTied()) { + if (Old->isTied()) { int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode()); if (NewMFMAOpc == -1) return false; MI->setDesc(TII->get(NewMFMAOpc)); MI->untieRegOperand(0); } - Old.ChangeToImmediate(Fold.ImmToFold); + Old->ChangeToImmediate(Fold.ImmToFold); return true; } if (Fold.isGlobal()) { - Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(), - Fold.OpToFold->getTargetFlags()); + Old->ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(), + Fold.OpToFold->getTargetFlags()); return true; } if (Fold.isFI()) { - Old.ChangeToFrameIndex(Fold.FrameIndexToFold); + Old->ChangeToFrameIndex(Fold.FrameIndexToFold); return true; } MachineOperand *New = Fold.OpToFold; - Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI); - Old.setIsUndef(New->isUndef()); + Old->substVirtReg(New->getReg(), New->getSubReg(), *TRI); + Old->setIsUndef(New->isUndef()); return true; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll index 26d1fbb09210c..cd4b3150bd193 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -165,9 +165,8 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) { ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xffc0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, s4, v0 -; GFX7-NEXT: v_add_i32_e32 v1, vcc, s4, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xffffffc0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat: @@ -199,6 +198,7 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_lo(<2 x i16> %a) { ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_lo: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, 0xffffffc0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -232,6 +232,7 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) { ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_hi: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, 0xffffffc0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v0 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xffffffc0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll index a727ed39c79c6..c8c97dd072dc2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll @@ -101,7 +101,7 @@ define amdgpu_ps float @add_shl_vgpr_const(i32 %a, i32 %b) { define amdgpu_ps float @add_shl_vgpr_const_inline_const(i32 %a) { ; VI-LABEL: add_shl_vgpr_const_inline_const: ; VI: ; %bb.0: -; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 +; VI-NEXT: v_mov_b32_e32 v0, 0x7e800 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7e800, v0 ; VI-NEXT: ; return to shader part epilog ; @@ -124,7 +124,7 @@ define amdgpu_ps float @add_shl_vgpr_const_inline_const(i32 %a) { define amdgpu_ps float @add_shl_vgpr_inline_const_x2(i32 %a) { ; VI-LABEL: add_shl_vgpr_inline_const_x2: ; VI: ; %bb.0: -; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 +; VI-NEXT: v_mov_b32_e32 v0, 0x600 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x600, v0 ; VI-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 701a733d9e8e9..59314737cf062 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -4865,6 +4865,7 @@ define i32 @v_extract_v64i32_37(ptr addrspace(1) %ptr) { ; MOVREL-LABEL: v_extract_v64i32_37: ; MOVREL: ; %bb.0: ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MOVREL-NEXT: v_mov_b32_e32 v0, 0x90 ; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x90, v0 ; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index cded5c94edf8c..3699206273574 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -231,14 +231,12 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v4 ; GFX6-NEXT: v_max_i32_e32 v1, v5, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -246,8 +244,8 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX6-NEXT: v_min_i32_e32 v4, 0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x80000000, v4 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x7fffffff, v3 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 @@ -512,15 +510,15 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v8, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s4, v8 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0x7fffffff, v8 ; GFX6-NEXT: v_max_i32_e32 v1, v10, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v8 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v1 +; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v1 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 @@ -1026,7 +1024,7 @@ define i24 @v_saddsat_i24(i24 %lhs, i24 %rhs) { ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 ; GFX8-NEXT: v_bfe_i32 v0, v1, 0, 24 ; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 23, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xff800000 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xff800000, v0 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -1265,19 +1263,17 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX6-LABEL: v_saddsat_v2i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v5, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_min_i32_e32 v4, 0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x80000000, v4 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 ; GFX6-NEXT: v_max_i32_e32 v3, v4, v3 ; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 @@ -1286,19 +1282,17 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX8-LABEL: v_saddsat_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v5, 0, v0 -; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v4, 0, v0 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s5, v5 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s4, v4 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x80000000, v5 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0x7fffffff, v4 ; GFX8-NEXT: v_max_i32_e32 v2, v5, v2 ; GFX8-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX8-NEXT: v_min_i32_e32 v4, 0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_max_i32_e32 v2, 0, v1 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s5, v4 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s4, v2 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0x80000000, v4 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 0x7fffffff, v2 ; GFX8-NEXT: v_max_i32_e32 v3, v4, v3 ; GFX8-NEXT: v_min_i32_e32 v2, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 @@ -1383,26 +1377,25 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX6-LABEL: v_saddsat_v3i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v7, 0, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v6, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, s5, v7 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s4, v6 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0x80000000, v7 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x7fffffff, v6 ; GFX6-NEXT: v_max_i32_e32 v3, v7, v3 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v6 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v1 +; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x80000000, v6 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 ; GFX6-NEXT: v_min_i32_e32 v3, v4, v3 ; GFX6-NEXT: v_min_i32_e32 v4, 0, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x80000000, v4 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x7fffffff, v3 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_min_i32_e32 v3, v4, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 @@ -1411,26 +1404,25 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX8-LABEL: v_saddsat_v3i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v7, 0, v0 -; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v6, 0, v0 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s5, v7 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s4, v6 +; GFX8-NEXT: v_sub_u32_e32 v7, vcc, 0x80000000, v7 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x7fffffff, v6 ; GFX8-NEXT: v_max_i32_e32 v3, v7, v3 ; GFX8-NEXT: v_min_i32_e32 v3, v3, v6 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v1 +; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_max_i32_e32 v3, 0, v1 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s5, v6 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x80000000, v6 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 ; GFX8-NEXT: v_max_i32_e32 v4, v6, v4 ; GFX8-NEXT: v_min_i32_e32 v3, v4, v3 ; GFX8-NEXT: v_min_i32_e32 v4, 0, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_max_i32_e32 v3, 0, v2 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s5, v4 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0x80000000, v4 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 0x7fffffff, v3 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v5 ; GFX8-NEXT: v_min_i32_e32 v3, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 @@ -1536,26 +1528,24 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX6-LABEL: v_saddsat_v4i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v9, 0, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v8, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, s5, v9 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s4, v8 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0x80000000, v9 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0x7fffffff, v8 ; GFX6-NEXT: v_max_i32_e32 v4, v9, v4 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0x80000000, v8 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v4 ; GFX6-NEXT: v_max_i32_e32 v5, v8, v5 ; GFX6-NEXT: v_min_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v4 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_min_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v3 @@ -1571,26 +1561,24 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX8-LABEL: v_saddsat_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v9, 0, v0 -; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v8, 0, v0 -; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s5, v9 -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s4, v8 +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, 0x80000000, v9 +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 0x7fffffff, v8 ; GFX8-NEXT: v_max_i32_e32 v4, v9, v4 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v8 ; GFX8-NEXT: v_min_i32_e32 v8, 0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 ; GFX8-NEXT: v_max_i32_e32 v4, 0, v1 -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s5, v8 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s4, v4 +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 0x80000000, v8 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0x7fffffff, v4 ; GFX8-NEXT: v_max_i32_e32 v5, v8, v5 ; GFX8-NEXT: v_min_i32_e32 v4, v5, v4 ; GFX8-NEXT: v_min_i32_e32 v5, 0, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4 ; GFX8-NEXT: v_max_i32_e32 v4, 0, v2 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s5, v5 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s4, v4 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x80000000, v5 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0x7fffffff, v4 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_min_i32_e32 v4, v5, v4 ; GFX8-NEXT: v_min_i32_e32 v5, 0, v3 @@ -1724,34 +1712,32 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX6-LABEL: v_saddsat_v5i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_min_i32_e32 v12, 0, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 +; GFX6-NEXT: v_min_i32_e32 v11, 0, v0 ; GFX6-NEXT: v_max_i32_e32 v10, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, s5, v12 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s4, v10 -; GFX6-NEXT: v_max_i32_e32 v5, v12, v5 +; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0x80000000, v11 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 0x7fffffff, v10 +; GFX6-NEXT: v_max_i32_e32 v5, v11, v5 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v10 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v1 +; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 0x80000000, v10 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 ; GFX6-NEXT: v_max_i32_e32 v6, v10, v6 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x80000000, v6 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x7fffffff, v5 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v7 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x80000000, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x7fffffff, v5 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v4 @@ -1767,34 +1753,32 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX8-LABEL: v_saddsat_v5i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: v_min_i32_e32 v12, 0, v0 -; GFX8-NEXT: s_brev_b32 s4, -2 +; GFX8-NEXT: v_min_i32_e32 v11, 0, v0 ; GFX8-NEXT: v_max_i32_e32 v10, 0, v0 -; GFX8-NEXT: v_sub_u32_e32 v12, vcc, s5, v12 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, s4, v10 -; GFX8-NEXT: v_max_i32_e32 v5, v12, v5 +; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 0x80000000, v11 +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 0x7fffffff, v10 +; GFX8-NEXT: v_max_i32_e32 v5, v11, v5 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v10 ; GFX8-NEXT: v_min_i32_e32 v10, 0, v1 +; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v1 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, s5, v10 +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 0x80000000, v10 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s4, v5 ; GFX8-NEXT: v_max_i32_e32 v6, v10, v6 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v5 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v2 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s5, v6 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x80000000, v6 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x7fffffff, v5 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v7 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v3 -; GFX8-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v3 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x80000000, v6 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v11, v5 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x7fffffff, v5 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v8 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v4 @@ -2766,13 +2750,11 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v5, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 @@ -2780,8 +2762,8 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x80000000, v4 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x7fffffff, v3 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 @@ -2978,13 +2960,11 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: saddsat_v2i16_vs: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s3, 1 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s2, -2 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 ; GFX6-NEXT: v_max_i32_e32 v3, s0, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 @@ -2992,8 +2972,8 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v1 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 ; GFX6-NEXT: v_max_i32_e32 v3, s0, v3 ; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 @@ -3059,14 +3039,14 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v8, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s4, v8 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0x7fffffff, v8 ; GFX6-NEXT: v_max_i32_e32 v4, v10, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v1 +; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v1 @@ -4135,12 +4115,12 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) { ; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffff8000, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v5 +; GFX6-NEXT: v_mov_b32_e32 v1, 0xffff8000 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xffff8000, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 15, v5 ; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_i48: @@ -4153,12 +4133,12 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) { ; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffff8000, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v5 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff8000 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xffff8000, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 15, v5 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_i48: @@ -4170,8 +4150,9 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) { ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 ; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc @@ -4342,12 +4323,12 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xffff8000, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0xffff8000 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xffff8000, v1 ; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 15, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: ; return to shader part epilog ; @@ -4361,12 +4342,12 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xffff8000, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff8000 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xffff8000, v1 ; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 15, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -4379,8 +4360,9 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 ; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4437,12 +4419,12 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xffff8000, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0xffff8000 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xffff8000, v1 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 15, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: ; return to shader part epilog ; @@ -4456,12 +4438,12 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xffff8000, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff8000 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xffff8000, v1 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 15, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -4474,8 +4456,9 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], 0 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 ; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4529,8 +4512,9 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] +; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0 ; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc @@ -4543,8 +4527,9 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] +; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc @@ -4557,8 +4542,9 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 ; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc @@ -4702,8 +4688,9 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] +; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0 ; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4716,8 +4703,9 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] +; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0 ; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4730,8 +4718,9 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 ; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4774,8 +4763,9 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 +; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4788,8 +4778,9 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 +; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4802,8 +4793,9 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4846,9 +4838,9 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v1, v5, vcc ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5] +; GFX6-NEXT: v_bfrev_b32_e32 v10, 1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v10 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1 ; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc @@ -4856,8 +4848,8 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v3, v7, vcc ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[6:7] +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v10 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v2 ; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -4870,9 +4862,9 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v1, v5, vcc ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5] +; GFX8-NEXT: v_bfrev_b32_e32 v10, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v10 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v0, v1 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc @@ -4880,8 +4872,8 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v3, v7, vcc ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[6:7] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000000, v10 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000000, v2 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -4894,9 +4886,9 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v5, vcc ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5] +; GFX9-NEXT: v_bfrev_b32_e32 v10, 1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v10 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 ; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc @@ -4904,8 +4896,8 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v7, vcc ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[6:7] +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v10 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v2 ; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -5677,6 +5669,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX6-NEXT: v_addc_u32_e32 v16, vcc, v2, v10, vcc ; GFX6-NEXT: v_addc_u32_e32 v17, vcc, v3, v11, vcc ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1] +; GFX6-NEXT: v_bfrev_b32_e32 v18, 1 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -5688,8 +5681,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v18 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc @@ -5712,7 +5704,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] ; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc ; GFX6-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0x80000000, v6 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0x80000000, v18 ; GFX6-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc @@ -5729,6 +5721,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, v2, v10, vcc ; GFX8-NEXT: v_addc_u32_e32 v17, vcc, v3, v11, vcc ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1] +; GFX8-NEXT: v_bfrev_b32_e32 v18, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -5740,8 +5733,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v18 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc @@ -5764,7 +5756,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] ; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc ; GFX8-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x80000000, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x80000000, v18 ; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc @@ -5781,6 +5773,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, v2, v10, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, v3, v11, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1] +; GFX9-NEXT: v_bfrev_b32_e32 v18, 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -5792,8 +5785,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v18 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc @@ -5816,7 +5808,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc ; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 0x80000000, v6 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 0x80000000, v18 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll index ab000d91a3ef2..bf658c9e72422 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll @@ -295,7 +295,7 @@ define i32 @v_sdiv_i32_pow2k_denom(i32 %num) { ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] -; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0 +; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, 0x1000, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5] ; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 @@ -345,7 +345,7 @@ define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) { ; GISEL-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0 ; GISEL-NEXT: v_cmp_le_u32_e64 s[6:7], s8, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7] -; GISEL-NEXT: v_subrev_i32_e32 v8, vcc, s8, v1 +; GISEL-NEXT: v_subrev_i32_e32 v8, vcc, 0x1000, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] @@ -363,7 +363,7 @@ define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) { ; CGP-LABEL: v_sdiv_v2i32_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s8, 0x1000 +; CGP-NEXT: s_movk_i32 s6, 0x1000 ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x45800000 ; CGP-NEXT: s_movk_i32 s4, 0xf000 @@ -375,35 +375,35 @@ define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) { ; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, 0x1000, v5 ; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v8, v3, s4 +; CGP-NEXT: v_mul_lo_u32 v9, v3, s4 ; CGP-NEXT: v_mul_lo_u32 v4, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 +; CGP-NEXT: v_mul_hi_u32 v9, v3, v9 ; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 ; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 ; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 ; CGP-NEXT: v_lshlrev_b32_e32 v7, 12, v3 -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v3 -; CGP-NEXT: v_lshlrev_b32_e32 v9, 12, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v4 +; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v3 +; CGP-NEXT: v_lshlrev_b32_e32 v10, 12, v4 +; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v4 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 -; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0 -; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] -; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0 -; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7] -; CGP-NEXT: v_subrev_i32_e32 v8, vcc, 0x1000, v1 -; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 +; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; CGP-NEXT: v_subrev_i32_e64 v7, s[4:5], s6, v0 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5 +; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v11, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v3 -; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] +; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[4:5] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc @@ -437,7 +437,7 @@ define i32 @v_sdiv_i32_oddk_denom(i32 %num) { ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] -; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0 +; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, 0x12d8fb, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5] ; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 @@ -486,7 +486,7 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) { ; GISEL-NEXT: v_subrev_i32_e32 v6, vcc, s8, v0 ; GISEL-NEXT: v_cmp_le_u32_e64 s[6:7], s8, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] -; GISEL-NEXT: v_subrev_i32_e32 v7, vcc, s8, v1 +; GISEL-NEXT: v_subrev_i32_e32 v7, vcc, 0x12d8fb, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] @@ -504,7 +504,7 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) { ; CGP-LABEL: v_sdiv_v2i32_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s8, 0x12d8fb +; CGP-NEXT: s_mov_b32 s6, 0x12d8fb ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x4996c7d8 ; CGP-NEXT: s_mov_b32 s4, 0xffed2705 @@ -516,35 +516,35 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) { ; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, 0x12d8fb, v5 ; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v8, v3, s4 +; CGP-NEXT: v_mul_lo_u32 v9, v3, s4 ; CGP-NEXT: v_mul_lo_u32 v4, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 +; CGP-NEXT: v_mul_hi_u32 v9, v3, v9 ; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 ; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 ; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v3, s8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v3 -; CGP-NEXT: v_mul_lo_u32 v9, v4, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v3, s6 +; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v3 +; CGP-NEXT: v_mul_lo_u32 v10, v4, v5 +; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v4 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 -; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0 -; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] -; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0 -; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7] -; CGP-NEXT: v_subrev_i32_e32 v8, vcc, 0x12d8fb, v1 -; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 +; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; CGP-NEXT: v_subrev_i32_e64 v7, s[4:5], s6, v0 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5 +; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v11, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v3 -; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] +; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[4:5] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll index 88ace1c51f5b0..b945bd7ec9cde 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll @@ -268,10 +268,10 @@ define i32 @v_srem_i32_pow2k_denom(i32 %num) { ; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 12, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, s4, v0 +; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, 0x1000, v0 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, s4, v0 +; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, 0x1000, v0 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 @@ -312,13 +312,13 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) { ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, s4, v1 +; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, 0x1000, v1 ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, s4, v1 +; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, 0x1000, v1 ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 @@ -344,15 +344,16 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) { ; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, 0x1000, v5 ; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v8, v3, s5 +; CGP-NEXT: v_mul_lo_u32 v9, v3, s5 ; CGP-NEXT: v_mul_lo_u32 v4, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 +; CGP-NEXT: v_mul_hi_u32 v9, v3, v9 ; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 ; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 ; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 @@ -361,19 +362,17 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) { ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x1000, v1 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x1000, v1 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: s_setpc_b64 s[30:31] @@ -399,10 +398,10 @@ define i32 @v_srem_i32_oddk_denom(i32 %num) { ; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 ; CHECK-NEXT: v_mul_lo_u32 v2, v2, s4 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, s4, v0 +; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, 0x12d8fb, v0 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, s4, v0 +; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, 0x12d8fb, v0 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 @@ -443,13 +442,13 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) { ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, s4, v1 +; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, s4, v1 +; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 @@ -475,15 +474,16 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) { ; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, 0x12d8fb, v5 ; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v8, v3, s5 +; CGP-NEXT: v_mul_lo_u32 v9, v3, s5 ; CGP-NEXT: v_mul_lo_u32 v4, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 +; CGP-NEXT: v_mul_hi_u32 v9, v3, v9 ; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 ; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 ; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 @@ -492,19 +492,17 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) { ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index d0c55c69f5087..b5aaa244a8382 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -1078,14 +1078,14 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6 +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, 0x1000, v6 ; CHECK-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc -; CHECK-NEXT: v_subrev_i32_e32 v6, vcc, 0x1000, v4 +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, 0x1000, v6 ; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc @@ -1699,14 +1699,14 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6 +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, 0x12d8fb, v6 ; CHECK-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc -; CHECK-NEXT: v_subrev_i32_e32 v6, vcc, 0x12d8fb, v4 +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, 0x12d8fb, v6 ; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 65455d754be4f..88b30a0430332 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -9,13 +9,13 @@ define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) { ; GFX6-LABEL: v_ssubsat_i7: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 25, v0 -; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 25, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 -; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 25, v0 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 25, v0 @@ -119,13 +119,13 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) { ; GFX6-LABEL: v_ssubsat_i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 -; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 @@ -231,23 +231,21 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5 ; GFX6-NEXT: v_max_i32_e32 v1, v4, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v5 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x7fffffff, v3 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x80000000, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 @@ -506,20 +504,20 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v8, -1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8 +; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 0x7fffffff, v8 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10 ; GFX6-NEXT: v_max_i32_e32 v1, v8, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v10 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v1 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5 @@ -1005,13 +1003,13 @@ define i24 @v_ssubsat_i24(i24 %lhs, i24 %rhs) { ; GFX6-LABEL: v_ssubsat_i24: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 -; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 8, v0 @@ -1026,7 +1024,7 @@ define i24 @v_ssubsat_i24(i24 %lhs, i24 %rhs) { ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 ; GFX8-NEXT: v_bfe_i32 v0, v1, 0, 24 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[6:7], 0, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 23, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xff800000 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xff800000, v0 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -1112,10 +1110,10 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) { ; GFX6-LABEL: v_ssubsat_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 -; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -1124,10 +1122,10 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) { ; GFX8-LABEL: v_ssubsat_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_i32_e32 v2, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 0x7fffffff, v2 -; GFX8-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 0x80000000, v3 +; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 0x7fffffff, v2 +; GFX8-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 0x80000000, v3 ; GFX8-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX8-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 @@ -1227,10 +1225,10 @@ define amdgpu_ps float @ssubsat_i32_sv(i32 inreg %lhs, i32 %rhs) { define amdgpu_ps float @ssubsat_i32_vs(i32 %lhs, i32 inreg %rhs) { ; GFX6-LABEL: ssubsat_i32_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_max_i32_e32 v1, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 0x7fffffff, v1 -; GFX6-NEXT: v_min_i32_e32 v2, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x80000000, v2 +; GFX6-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0x7fffffff, v1 +; GFX6-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x80000000, v2 ; GFX6-NEXT: v_max_i32_e32 v1, s0, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -1238,10 +1236,10 @@ define amdgpu_ps float @ssubsat_i32_vs(i32 %lhs, i32 inreg %rhs) { ; ; GFX8-LABEL: ssubsat_i32_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_max_i32_e32 v1, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 0x7fffffff, v1 -; GFX8-NEXT: v_min_i32_e32 v2, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 0x80000000, v2 +; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 0x7fffffff, v1 +; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 0x80000000, v2 ; GFX8-NEXT: v_max_i32_e32 v1, s0, v1 ; GFX8-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 @@ -1265,19 +1263,17 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX6-LABEL: v_ssubsat_v2i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v5 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s4, v2 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x80000000, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 @@ -1286,19 +1282,17 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX8-LABEL: v_ssubsat_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v0 -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 0x7fffffff, v4 ; GFX8-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s5, v5 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, 0x80000000, v5 ; GFX8-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX8-NEXT: v_min_i32_e32 v2, v2, v5 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_max_i32_e32 v2, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s4, v2 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 0x7fffffff, v2 ; GFX8-NEXT: v_min_i32_e32 v4, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s5, v4 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 0x80000000, v4 ; GFX8-NEXT: v_max_i32_e32 v2, v2, v3 ; GFX8-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v2 @@ -1383,26 +1377,25 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX6-LABEL: v_ssubsat_v3i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v6, -1, v0 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s4, v6 +; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x7fffffff, v6 ; GFX6-NEXT: v_min_i32_e32 v7, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, s5, v7 +; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, 0x80000000, v7 ; GFX6-NEXT: v_max_i32_e32 v3, v6, v3 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v7 +; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s5, v6 +; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6 ; GFX6-NEXT: v_max_i32_e32 v3, v3, v4 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v6 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x7fffffff, v3 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x80000000, v4 ; GFX6-NEXT: v_max_i32_e32 v3, v3, v5 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v4 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 @@ -1411,26 +1404,25 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX8-LABEL: v_ssubsat_v3i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v6, -1, v0 -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s4, v6 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x7fffffff, v6 ; GFX8-NEXT: v_min_i32_e32 v7, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s5, v7 +; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, 0x80000000, v7 ; GFX8-NEXT: v_max_i32_e32 v3, v6, v3 ; GFX8-NEXT: v_min_i32_e32 v3, v3, v7 +; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_max_i32_e32 v3, -1, v1 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s4, v3 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s5, v6 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6 ; GFX8-NEXT: v_max_i32_e32 v3, v3, v4 ; GFX8-NEXT: v_min_i32_e32 v3, v3, v6 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_max_i32_e32 v3, -1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 0x7fffffff, v3 ; GFX8-NEXT: v_min_i32_e32 v4, -1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s5, v4 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 0x80000000, v4 ; GFX8-NEXT: v_max_i32_e32 v3, v3, v5 ; GFX8-NEXT: v_min_i32_e32 v3, v3, v4 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 @@ -1536,33 +1528,31 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX6-LABEL: v_ssubsat_v4i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v8, -1, v0 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8 -; GFX6-NEXT: v_min_i32_e32 v9, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v9, vcc, s5, v9 +; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 0x7fffffff, v8 +; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 +; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, 0x80000000, v10 ; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 -; GFX6-NEXT: v_min_i32_e32 v4, v4, v9 +; GFX6-NEXT: v_min_i32_e32 v4, v4, v10 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s5, v8 +; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 0x80000000, v8 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v6 +; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 +; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_max_i32_e32 v4, -1, v3 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4 -; GFX6-NEXT: v_min_i32_e32 v5, -1, v3 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v9 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v7 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 @@ -1571,33 +1561,31 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX8-LABEL: v_ssubsat_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v8, -1, v0 -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s4, v8 -; GFX8-NEXT: v_min_i32_e32 v9, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s5, v9 +; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 0x7fffffff, v8 +; GFX8-NEXT: v_min_i32_e32 v10, -1, v0 +; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, 0x80000000, v10 ; GFX8-NEXT: v_max_i32_e32 v4, v8, v4 -; GFX8-NEXT: v_min_i32_e32 v4, v4, v9 +; GFX8-NEXT: v_min_i32_e32 v4, v4, v10 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 0x7fffffff, v4 ; GFX8-NEXT: v_min_i32_e32 v8, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s5, v8 +; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 0x80000000, v8 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v5 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v8 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v4 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 0x7fffffff, v4 ; GFX8-NEXT: v_min_i32_e32 v5, -1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s5, v5 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, 0x80000000, v5 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v6 +; GFX8-NEXT: v_bfrev_b32_e32 v9, -2 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v5 +; GFX8-NEXT: v_bfrev_b32_e32 v11, 1 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v4 -; GFX8-NEXT: v_max_i32_e32 v4, -1, v3 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 0x7fffffff, v4 -; GFX8-NEXT: v_min_i32_e32 v5, -1, v3 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, 0x80000000, v5 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0x7fffffff, v9 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x80000000, v11 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v7 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v4 @@ -1724,41 +1712,37 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX6-LABEL: v_ssubsat_v5i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v10, -1, v0 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s4, v10 +; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, 0x7fffffff, v10 ; GFX6-NEXT: v_min_i32_e32 v12, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, s5, v12 +; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, 0x80000000, v12 ; GFX6-NEXT: v_max_i32_e32 v5, v10, v5 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v12 +; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v1 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10 +; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, 0x80000000, v10 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v10 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x7fffffff, v5 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s5, v6 +; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v7 -; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_bfrev_b32_e32 v11, -2 +; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 +; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v11 -; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x7fffffff, v11 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x80000000, v13 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v8 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_max_i32_e32 v5, -1, v4 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x7fffffff, v5 -; GFX6-NEXT: v_min_i32_e32 v6, -1, v4 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x7fffffff, v11 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x80000000, v13 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v5 @@ -1767,41 +1751,37 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX8-LABEL: v_ssubsat_v5i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v10, -1, v0 -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s4, v10 +; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, 0x7fffffff, v10 ; GFX8-NEXT: v_min_i32_e32 v12, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v12, vcc, s5, v12 +; GFX8-NEXT: v_subrev_u32_e32 v12, vcc, 0x80000000, v12 ; GFX8-NEXT: v_max_i32_e32 v5, v10, v5 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v12 +; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v5 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v1 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v5 ; GFX8-NEXT: v_min_i32_e32 v10, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s5, v10 +; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, 0x80000000, v10 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v10 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v5 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, 0x7fffffff, v5 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s5, v6 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v7 -; GFX8-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_bfrev_b32_e32 v11, -2 +; GFX8-NEXT: v_min_i32_e32 v5, v5, v6 +; GFX8-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v11 -; GFX8-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x7fffffff, v11 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x80000000, v13 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v8 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v5 -; GFX8-NEXT: v_max_i32_e32 v5, -1, v4 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, 0x7fffffff, v5 -; GFX8-NEXT: v_min_i32_e32 v6, -1, v4 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x7fffffff, v11 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x80000000, v13 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v9 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v5 @@ -2592,13 +2572,13 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) { ; GFX6-LABEL: v_ssubsat_i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 -; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 @@ -2724,13 +2704,13 @@ define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) { define amdgpu_ps half @ssubsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; GFX6-LABEL: ssubsat_i16_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_max_i32_e32 v1, -1, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 0x7fffffff, v1 -; GFX6-NEXT: v_min_i32_e32 v2, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x80000000, v2 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0x7fffffff, v1 +; GFX6-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x80000000, v2 ; GFX6-NEXT: v_max_i32_e32 v1, s0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 @@ -2766,22 +2746,20 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x7fffffff, v3 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x80000000, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 @@ -2978,22 +2956,20 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: ssubsat_v2i16_vs: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s2, -2 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s3, 1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v2 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v3 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v1 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v2 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v3 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 @@ -3056,16 +3032,16 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v8, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8 +; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 0x7fffffff, v8 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10 ; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v10 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v1 @@ -4135,12 +4111,12 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) { ; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffff8000, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v5 +; GFX6-NEXT: v_mov_b32_e32 v1, 0xffff8000 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xffff8000, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 15, v5 ; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_i48: @@ -4153,12 +4129,12 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) { ; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffff8000, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v5 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff8000 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xffff8000, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 15, v5 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_i48: @@ -4170,8 +4146,9 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) { ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 ; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc @@ -4342,12 +4319,12 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xffff8000, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0xffff8000 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xffff8000, v1 ; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 15, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: ; return to shader part epilog ; @@ -4361,12 +4338,12 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xffff8000, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff8000 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xffff8000, v1 ; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 15, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -4379,8 +4356,9 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 ; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4437,12 +4415,12 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xffff8000, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0xffff8000 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xffff8000, v1 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 15, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: ; return to shader part epilog ; @@ -4456,12 +4434,12 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xffff8000, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff8000 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xffff8000, v1 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 15, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -4474,8 +4452,9 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], 0 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 ; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4529,8 +4508,9 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] +; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0 ; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc @@ -4543,8 +4523,9 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] +; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc @@ -4557,8 +4538,9 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 ; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc @@ -4702,8 +4684,9 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] +; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0 ; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4716,8 +4699,9 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] +; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0 ; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4730,8 +4714,9 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 ; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4774,8 +4759,9 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 +; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4788,8 +4774,9 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 +; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4802,8 +4789,9 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4846,9 +4834,9 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX6-NEXT: v_subb_u32_e32 v9, vcc, v1, v5, vcc ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[4:5] +; GFX6-NEXT: v_bfrev_b32_e32 v10, 1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v10 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1 ; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc @@ -4856,8 +4844,8 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v3, v7, vcc ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[6:7] +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v10 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v2 ; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -4870,9 +4858,9 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX8-NEXT: v_subb_u32_e32 v9, vcc, v1, v5, vcc ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[4:5] +; GFX8-NEXT: v_bfrev_b32_e32 v10, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v10 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v0, v1 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc @@ -4880,8 +4868,8 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v3, v7, vcc ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[6:7] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000000, v10 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000000, v2 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -4894,9 +4882,9 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v5, vcc ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[4:5] +; GFX9-NEXT: v_bfrev_b32_e32 v10, 1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v10 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 ; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc @@ -4904,8 +4892,8 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v7, vcc ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[6:7] +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v10 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v2 ; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -5706,6 +5694,7 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX6-NEXT: v_subb_u32_e32 v18, vcc, v2, v10, vcc ; GFX6-NEXT: v_subb_u32_e32 v19, vcc, v3, v11, vcc ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1] +; GFX6-NEXT: v_bfrev_b32_e32 v20, 1 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -5719,8 +5708,7 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v19 -; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v20 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc @@ -5744,9 +5732,9 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] ; GFX6-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX6-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v11 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0x80000000, v6 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0x80000000, v20 ; GFX6-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v11 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc @@ -5762,6 +5750,7 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX8-NEXT: v_subb_u32_e32 v18, vcc, v2, v10, vcc ; GFX8-NEXT: v_subb_u32_e32 v19, vcc, v3, v11, vcc ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1] +; GFX8-NEXT: v_bfrev_b32_e32 v20, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -5775,8 +5764,7 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v19 -; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v20 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc @@ -5800,9 +5788,9 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] ; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX8-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v11 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x80000000, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x80000000, v20 ; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v11 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc @@ -5818,6 +5806,7 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v2, v10, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v19, vcc, v3, v11, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1] +; GFX9-NEXT: v_bfrev_b32_e32 v20, 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -5831,8 +5820,7 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v19 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v20 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc @@ -5856,9 +5844,9 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v11 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 0x80000000, v6 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 0x80000000, v20 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v11 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll index 48f05a33f0364..482baf3249ed5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll @@ -222,10 +222,10 @@ define i32 @v_urem_i32_oddk_denom(i32 %num) { ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 ; CHECK-NEXT: v_mul_lo_u32 v1, v1, s4 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 +; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, 0x12d8fb, v0 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 +; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, 0x12d8fb, v0 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -242,6 +242,7 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) { ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, 0x12d8fb ; GISEL-NEXT: v_mov_b32_e32 v4, 0xffed2705 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0x12d8fb, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GISEL-NEXT: v_mul_lo_u32 v4, v3, v4 @@ -253,18 +254,16 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) { ; GISEL-NEXT: v_mul_lo_u32 v3, v3, v2 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 +; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 0x12d8fb, v0 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 +; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 0x12d8fb, v0 ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i32_oddk_denom: @@ -273,25 +272,26 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) { ; CGP-NEXT: s_mov_b32 s4, 0x12d8fb ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8 ; CGP-NEXT: s_mov_b32 s5, 0xffed2705 +; CGP-NEXT: v_mov_b32_e32 v3, 0x12d8fb ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0x12d8fb, v3 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v2, s5 -; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v0, v2 +; CGP-NEXT: v_mul_lo_u32 v4, v2, s5 +; CGP-NEXT: v_mul_hi_u32 v4, v2, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v1, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v3, s4 +; CGP-NEXT: v_mul_lo_u32 v4, v4, s4 ; CGP-NEXT: v_mul_lo_u32 v2, v2, s4 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; CGP-NEXT: v_subrev_i32_e32 v2, vcc, s4, v0 -; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v1 +; CGP-NEXT: v_subrev_i32_e32 v2, vcc, 0x12d8fb, v0 +; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; CGP-NEXT: v_subrev_i32_e32 v2, vcc, s4, v0 -; CGP-NEXT: v_subrev_i32_e32 v3, vcc, 0x12d8fb, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; CGP-NEXT: v_subrev_i32_e32 v2, vcc, 0x12d8fb, v0 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index 097f6642cbc66..00de46b168acb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -969,123 +969,123 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) { ; CHECK-LABEL: v_urem_i64_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, 0x12d8fb +; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb ; CHECK-NEXT: v_mov_b32_e32 v2, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; CHECK-NEXT: s_mov_b32 s5, 0xffed2705 +; CHECK-NEXT: s_mov_b32 s7, 0xffed2705 ; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, 0x12d8fb, v2 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; CHECK-NEXT: v_trunc_f32_e32 v4, v4 -; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3 +; CHECK-NEXT: v_trunc_f32_e32 v6, v6 +; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CHECK-NEXT: v_mul_lo_u32 v5, v4, s5 -; CHECK-NEXT: v_mul_lo_u32 v6, v3, s5 -; CHECK-NEXT: v_mul_hi_u32 v7, s5, v3 -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v6 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v4, v6 -; CHECK-NEXT: v_mul_lo_u32 v9, v3, v5 -; CHECK-NEXT: v_mul_lo_u32 v10, v4, v5 -; CHECK-NEXT: v_mul_hi_u32 v11, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v3, s5 -; CHECK-NEXT: v_mul_hi_u32 v6, s5, v3 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, s5 -; CHECK-NEXT: v_mul_lo_u32 v8, v4, v5 -; CHECK-NEXT: v_mul_hi_u32 v9, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_mul_lo_u32 v7, v3, v6 -; CHECK-NEXT: v_mul_lo_u32 v10, v4, v6 -; CHECK-NEXT: v_mul_hi_u32 v11, v3, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v4, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v9 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v1, v3 -; CHECK-NEXT: v_mul_hi_u32 v6, v0, v3 +; CHECK-NEXT: v_mul_lo_u32 v7, v6, s7 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, s7 +; CHECK-NEXT: v_mul_hi_u32 v9, s7, v3 +; CHECK-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v3 +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v8 +; CHECK-NEXT: v_mul_hi_u32 v10, v3, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v8 +; CHECK-NEXT: v_mul_lo_u32 v11, v3, v7 +; CHECK-NEXT: v_mul_lo_u32 v12, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v13, v3, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v6, v7 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v12, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v13 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 +; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10 +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 +; CHECK-NEXT: v_add_i32_e64 v3, s[4:5], v3, v8 +; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v6, v7, s[4:5] +; CHECK-NEXT: v_mul_lo_u32 v7, v3, s7 +; CHECK-NEXT: v_mul_hi_u32 v8, s7, v3 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, s7 +; CHECK-NEXT: v_mul_lo_u32 v10, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v11, v3, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v6, v7 +; CHECK-NEXT: v_sub_i32_e64 v9, s[4:5], v9, v3 +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 +; CHECK-NEXT: v_mul_lo_u32 v9, v3, v8 +; CHECK-NEXT: v_mul_lo_u32 v12, v6, v8 +; CHECK-NEXT: v_mul_hi_u32 v13, v3, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v8 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v13 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v12, v11 +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 +; CHECK-NEXT: v_add_i32_e64 v3, s[4:5], v3, v7 +; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v6, v8, s[4:5] +; CHECK-NEXT: v_mul_lo_u32 v7, v1, v3 +; CHECK-NEXT: v_mul_hi_u32 v8, v0, v3 ; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 -; CHECK-NEXT: v_mul_lo_u32 v7, v0, v4 -; CHECK-NEXT: v_mul_lo_u32 v8, v1, v4 -; CHECK-NEXT: v_mul_hi_u32 v9, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_lo_u32 v6, v3, s4 -; CHECK-NEXT: v_mul_hi_u32 v3, s4, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_mul_lo_u32 v4, v4, s4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v6 -; CHECK-NEXT: v_subb_u32_e64 v4, vcc, v1, v3, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v2 -; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v4 +; CHECK-NEXT: v_mul_lo_u32 v9, v0, v6 +; CHECK-NEXT: v_mul_lo_u32 v10, v1, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v0, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v1, v6 +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v3, s[4:5], v10, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v3, s[4:5], v3, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8 +; CHECK-NEXT: v_add_i32_e64 v3, s[4:5], v3, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, s6 +; CHECK-NEXT: v_mul_hi_u32 v3, s6, v3 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 +; CHECK-NEXT: v_mul_lo_u32 v6, v6, s6 +; CHECK-NEXT: v_add_i32_e64 v3, s[4:5], v6, v3 +; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v8 +; CHECK-NEXT: v_subb_u32_e64 v6, s[6:7], v1, v3, s[4:5] +; CHECK-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v3 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[6:7] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[6:7] ; CHECK-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] ; CHECK-NEXT: s_mov_b64 s[4:5], vcc -; CHECK-NEXT: v_subrev_i32_e32 v6, vcc, 0x12d8fb, v5 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, 0x12d8fb, v2 ; CHECK-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v2, -1, v2, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = urem i64 %num, 1235195 ret i64 %result @@ -1095,217 +1095,217 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-LABEL: v_urem_v2i64_oddk_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0x12d8fb +; GISEL-NEXT: s_mov_b32 s6, 0x12d8fb ; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 -; GISEL-NEXT: s_sub_u32 s5, 0, 0x12d8fb -; GISEL-NEXT: v_madmk_f32 v7, v6, 0x4f800000, v5 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, 0x12d8fb +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 ; GISEL-NEXT: s_sub_u32 s7, 0, 0x12d8fb -; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_madmk_f32 v7, v5, 0x4f800000, v6 ; GISEL-NEXT: s_subb_u32 s8, 0, 0 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v5 -; GISEL-NEXT: v_trunc_f32_e32 v7, v7 -; GISEL-NEXT: v_trunc_f32_e32 v8, v8 -; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7 +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v5 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0x12d8fb, v4 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v6 +; GISEL-NEXT: s_sub_u32 s9, 0, 0x12d8fb +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; GISEL-NEXT: s_subb_u32 s10, 0, 0 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v7 +; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v8 +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_trunc_f32_e32 v10, v10 +; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v10 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v8 +; GISEL-NEXT: v_mul_lo_u32 v11, s7, v9 ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_lo_u32 v9, s5, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, s7, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, s5, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, s6, v6 -; GISEL-NEXT: v_mul_hi_u32 v13, s5, v6 -; GISEL-NEXT: v_mul_lo_u32 v14, s7, v5 -; GISEL-NEXT: v_mul_lo_u32 v15, s8, v5 -; GISEL-NEXT: v_mul_hi_u32 v16, s7, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, v7, v11 -; GISEL-NEXT: v_mul_hi_u32 v17, v6, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v7, v11 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v8, v14 -; GISEL-NEXT: v_mul_hi_u32 v18, v5, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v16 -; GISEL-NEXT: v_mul_lo_u32 v13, v6, v9 -; GISEL-NEXT: v_mul_lo_u32 v16, v7, v9 -; GISEL-NEXT: v_mul_hi_u32 v19, v6, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v21, v8, v10 -; GISEL-NEXT: v_mul_hi_u32 v22, v5, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v8, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v21, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v22 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v17 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v15 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v21, v18 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v15 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, s5, v6 -; GISEL-NEXT: v_mul_lo_u32 v11, s6, v6 -; GISEL-NEXT: v_mul_hi_u32 v12, s5, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, s7, v5 -; GISEL-NEXT: v_mul_lo_u32 v13, s8, v5 -; GISEL-NEXT: v_mul_hi_u32 v14, s7, v5 -; GISEL-NEXT: v_mul_lo_u32 v15, s5, v7 -; GISEL-NEXT: v_mul_lo_u32 v16, v7, v9 -; GISEL-NEXT: v_mul_hi_u32 v17, v6, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 -; GISEL-NEXT: v_mul_lo_u32 v18, s7, v8 -; GISEL-NEXT: v_mul_lo_u32 v19, v8, v10 -; GISEL-NEXT: v_mul_hi_u32 v20, v5, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v8, v10 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v14 -; GISEL-NEXT: v_mul_lo_u32 v13, v6, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v11 -; GISEL-NEXT: v_mul_hi_u32 v15, v6, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v7, v11 -; GISEL-NEXT: v_mul_lo_u32 v18, v5, v12 -; GISEL-NEXT: v_mul_lo_u32 v21, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v22, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v21, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v22 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v19, v17 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v21, v18 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v15 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v1, v6 -; GISEL-NEXT: v_mul_hi_u32 v11, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v12, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 -; GISEL-NEXT: v_mul_lo_u32 v13, v0, v7 -; GISEL-NEXT: v_mul_lo_u32 v14, v1, v7 -; GISEL-NEXT: v_mul_hi_u32 v15, v0, v7 +; GISEL-NEXT: v_mul_lo_u32 v12, s9, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, s7, v7 +; GISEL-NEXT: v_mul_lo_u32 v14, s8, v7 +; GISEL-NEXT: v_mul_hi_u32 v15, s7, v7 +; GISEL-NEXT: v_mul_lo_u32 v16, s9, v8 +; GISEL-NEXT: v_mul_lo_u32 v17, s10, v8 +; GISEL-NEXT: v_mul_hi_u32 v18, s9, v8 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 +; GISEL-NEXT: v_mul_lo_u32 v14, v9, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v17, v12 +; GISEL-NEXT: v_mul_lo_u32 v17, v10, v16 +; GISEL-NEXT: v_mul_hi_u32 v20, v8, v16 +; GISEL-NEXT: v_mul_hi_u32 v16, v10, v16 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v18 +; GISEL-NEXT: v_mul_lo_u32 v15, v7, v11 +; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 +; GISEL-NEXT: v_mul_hi_u32 v21, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v22, v8, v12 +; GISEL-NEXT: v_mul_lo_u32 v23, v10, v12 +; GISEL-NEXT: v_mul_hi_u32 v24, v8, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v17, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v23, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v21 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v17, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v16, v24 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v18, v19 +; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v22, v17 +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v23, v20 +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v16, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v18, v17 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v13 +; GISEL-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v11, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v11, s7, v7 +; GISEL-NEXT: v_mul_lo_u32 v13, s8, v7 +; GISEL-NEXT: v_mul_hi_u32 v14, s7, v7 +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v16 +; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v10, v12, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v12, s9, v8 +; GISEL-NEXT: v_mul_lo_u32 v15, s10, v8 +; GISEL-NEXT: v_mul_hi_u32 v16, s9, v8 +; GISEL-NEXT: v_mul_lo_u32 v17, s7, v9 +; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 +; GISEL-NEXT: v_mul_hi_u32 v19, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v20, s9, v10 +; GISEL-NEXT: v_mul_lo_u32 v21, v10, v12 +; GISEL-NEXT: v_mul_hi_u32 v22, v8, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v17 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v15, v16 +; GISEL-NEXT: v_mul_lo_u32 v15, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v16, v9, v13 +; GISEL-NEXT: v_mul_hi_u32 v17, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_mul_lo_u32 v20, v8, v14 +; GISEL-NEXT: v_mul_lo_u32 v23, v10, v14 +; GISEL-NEXT: v_mul_hi_u32 v24, v8, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v10, v14 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v18, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v16, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v20, s[4:5], v21, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v23, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v24 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v18, v15 +; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v16, v17 +; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v21, v19 +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v23, v20 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 +; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v18, v17 +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v11 +; GISEL-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v13, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_mul_lo_u32 v16, v2, v8 -; GISEL-NEXT: v_mul_lo_u32 v17, v3, v8 -; GISEL-NEXT: v_mul_hi_u32 v18, v2, v8 +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v12 +; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v10, v14, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v12, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v14, v2, v8 ; GISEL-NEXT: v_mul_hi_u32 v8, v3, v8 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v14, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v17, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v16, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v6, s4 -; GISEL-NEXT: v_mul_hi_u32 v6, s4, v6 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, v5, s4 -; GISEL-NEXT: v_mul_hi_u32 v5, s4, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_mul_lo_u32 v7, v7, s4 -; GISEL-NEXT: v_mul_lo_u32 v8, v8, s4 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v11 -; GISEL-NEXT: v_subb_u32_e64 v7, vcc, v1, v6, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e64 v2, s[6:7], v2, v12 -; GISEL-NEXT: v_subb_u32_e64 v8, vcc, v3, v5, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v4 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[8:9] +; GISEL-NEXT: v_mul_lo_u32 v15, v0, v9 +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 +; GISEL-NEXT: v_mul_lo_u32 v18, v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v19, v3, v10 +; GISEL-NEXT: v_mul_hi_u32 v20, v2, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v16, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v19, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v15, v11 +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v18, v12 +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v19, v14 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v7, s6 +; GISEL-NEXT: v_mul_hi_u32 v7, s6, v7 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 +; GISEL-NEXT: v_mul_lo_u32 v14, v8, s6 +; GISEL-NEXT: v_mul_hi_u32 v8, s6, v8 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, s6 +; GISEL-NEXT: v_mul_lo_u32 v10, v10, s6 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v13 +; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], v1, v7, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[6:7] +; GISEL-NEXT: v_sub_i32_e64 v2, s[6:7], v2, v14 +; GISEL-NEXT: v_subb_u32_e64 v10, s[8:9], v3, v8, s[6:7] +; GISEL-NEXT: v_sub_i32_e64 v3, s[8:9], v3, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[8:9] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v7, -1, v7, s[8:9] ; GISEL-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v8, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GISEL-NEXT: s_mov_b64 s[4:5], vcc -; GISEL-NEXT: v_subrev_i32_e32 v11, vcc, 0x12d8fb, v9 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0x12d8fb, v4 ; GISEL-NEXT: v_sub_i32_e64 v12, s[6:7], v0, v4 ; GISEL-NEXT: v_subbrev_u32_e64 v1, s[6:7], 0, v1, s[6:7] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v4 @@ -1316,231 +1316,231 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v4 ; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v10, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GISEL-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0x12d8fb +; CGP-NEXT: s_mov_b32 s6, 0x12d8fb ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb -; CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x12d8fb -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 -; CGP-NEXT: s_mov_b32 s5, 0xffed2705 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, 0x12d8fb +; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 +; CGP-NEXT: s_mov_b32 s7, 0xffed2705 ; CGP-NEXT: v_cvt_f32_u32_e32 v7, 0x12d8fb ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v8, 0 -; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v5 ; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v8 -; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v7 -; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 -; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 -; CGP-NEXT: v_trunc_f32_e32 v7, v7 -; CGP-NEXT: v_trunc_f32_e32 v8, v8 -; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0x12d8fb, v4 +; CGP-NEXT: v_rcp_iflag_f32_e32 v8, v6 +; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; CGP-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 +; CGP-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 +; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v7 +; CGP-NEXT: v_trunc_f32_e32 v9, v9 +; CGP-NEXT: v_trunc_f32_e32 v10, v10 +; CGP-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 +; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9 +; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_mul_lo_u32 v9, v7, s5 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_mul_lo_u32 v10, v8, s5 -; CGP-NEXT: v_mul_lo_u32 v11, v5, s5 -; CGP-NEXT: v_mul_hi_u32 v12, s5, v5 -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v9, v5 -; CGP-NEXT: v_mul_lo_u32 v13, v6, s5 -; CGP-NEXT: v_mul_hi_u32 v14, s5, v6 -; CGP-NEXT: v_sub_i32_e32 v10, vcc, v10, v6 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v12, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v11 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; CGP-NEXT: v_mul_lo_u32 v14, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v6, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 -; CGP-NEXT: v_mul_lo_u32 v17, v5, v9 -; CGP-NEXT: v_mul_lo_u32 v18, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v19, v5, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v20, v6, v10 -; CGP-NEXT: v_mul_lo_u32 v21, v8, v10 -; CGP-NEXT: v_mul_hi_u32 v22, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v10 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v18, v11 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v20 -; CGP-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v21, v13 -; CGP-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v19 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v22 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v18, v15 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v20, v14 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v21, v16 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v12 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v5, s5 -; CGP-NEXT: v_mul_hi_u32 v11, s5, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v6, s5 -; CGP-NEXT: v_mul_hi_u32 v12, s5, v6 -; CGP-NEXT: v_mul_lo_u32 v13, v7, s5 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v16, v8, s5 -; CGP-NEXT: v_mul_lo_u32 v17, v8, v10 -; CGP-NEXT: v_mul_hi_u32 v18, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v10 -; CGP-NEXT: v_sub_i32_e32 v13, vcc, v13, v5 -; CGP-NEXT: v_sub_i32_e32 v16, vcc, v16, v6 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v11 -; CGP-NEXT: v_mul_lo_u32 v16, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v19, v5, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v11 -; CGP-NEXT: v_mul_lo_u32 v20, v6, v12 -; CGP-NEXT: v_mul_lo_u32 v21, v8, v12 -; CGP-NEXT: v_mul_hi_u32 v22, v6, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; CGP-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v21, v10 -; CGP-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v19 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v18 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v22 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v15 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v20, v17 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v21, v18 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v15 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v1, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v0, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v12, v2, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v3, v6 -; CGP-NEXT: v_mul_lo_u32 v13, v0, v7 -; CGP-NEXT: v_mul_lo_u32 v14, v1, v7 -; CGP-NEXT: v_mul_hi_u32 v15, v0, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_mul_lo_u32 v16, v2, v8 -; CGP-NEXT: v_mul_lo_u32 v17, v3, v8 -; CGP-NEXT: v_mul_hi_u32 v18, v2, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v17, v6 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v15 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v18 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v16, v10 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_mul_lo_u32 v11, v5, s4 -; CGP-NEXT: v_mul_hi_u32 v5, s4, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_mul_lo_u32 v12, v6, s4 -; CGP-NEXT: v_mul_hi_u32 v6, s4, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_mul_lo_u32 v7, v7, s4 -; CGP-NEXT: v_mul_lo_u32 v8, v8, s4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v11 -; CGP-NEXT: v_subb_u32_e64 v7, vcc, v1, v5, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; CGP-NEXT: v_sub_i32_e64 v2, s[6:7], v2, v12 -; CGP-NEXT: v_subb_u32_e64 v8, vcc, v3, v6, s[6:7] -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v2, v4 -; CGP-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v7 -; CGP-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[8:9] +; CGP-NEXT: v_mul_lo_u32 v11, v9, s7 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 +; CGP-NEXT: v_mul_lo_u32 v12, v10, s7 +; CGP-NEXT: v_mul_lo_u32 v13, v8, s7 +; CGP-NEXT: v_mul_hi_u32 v14, s7, v8 +; CGP-NEXT: v_sub_i32_e64 v11, s[4:5], v11, v8 +; CGP-NEXT: v_mul_lo_u32 v15, v7, s7 +; CGP-NEXT: v_mul_hi_u32 v16, s7, v7 +; CGP-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v7 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 +; CGP-NEXT: v_mul_lo_u32 v14, v9, v13 +; CGP-NEXT: v_mul_hi_u32 v17, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v16 +; CGP-NEXT: v_mul_lo_u32 v16, v10, v15 +; CGP-NEXT: v_mul_hi_u32 v18, v7, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v10, v15 +; CGP-NEXT: v_mul_lo_u32 v19, v8, v11 +; CGP-NEXT: v_mul_lo_u32 v20, v9, v11 +; CGP-NEXT: v_mul_hi_u32 v21, v8, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v11 +; CGP-NEXT: v_mul_lo_u32 v22, v7, v12 +; CGP-NEXT: v_mul_lo_u32 v23, v10, v12 +; CGP-NEXT: v_mul_hi_u32 v24, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v10, v12 +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 +; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v20, v13 +; CGP-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v16, v22 +; CGP-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v23, v15 +; CGP-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v21 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v16, v18 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v24 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v19, v14 +; CGP-NEXT: v_add_i32_e64 v17, s[4:5], v20, v17 +; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v22, v16 +; CGP-NEXT: v_add_i32_e64 v18, s[4:5], v23, v18 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v17, v14 +; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v18, v16 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v16 +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v13 +; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v11, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v11, v8, s7 +; CGP-NEXT: v_mul_hi_u32 v13, s7, v8 +; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v15 +; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v10, v12, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v12, v7, s7 +; CGP-NEXT: v_mul_hi_u32 v14, s7, v7 +; CGP-NEXT: v_mul_lo_u32 v15, v9, s7 +; CGP-NEXT: v_mul_lo_u32 v16, v9, v11 +; CGP-NEXT: v_mul_hi_u32 v17, v8, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v11 +; CGP-NEXT: v_mul_lo_u32 v18, v10, s7 +; CGP-NEXT: v_mul_lo_u32 v19, v10, v12 +; CGP-NEXT: v_mul_hi_u32 v20, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v10, v12 +; CGP-NEXT: v_sub_i32_e64 v15, s[4:5], v15, v8 +; CGP-NEXT: v_sub_i32_e64 v18, s[4:5], v18, v7 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v18, v14 +; CGP-NEXT: v_mul_lo_u32 v15, v8, v13 +; CGP-NEXT: v_mul_lo_u32 v18, v9, v13 +; CGP-NEXT: v_mul_hi_u32 v21, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 +; CGP-NEXT: v_mul_lo_u32 v22, v7, v14 +; CGP-NEXT: v_mul_lo_u32 v23, v10, v14 +; CGP-NEXT: v_mul_hi_u32 v24, v7, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v14 +; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v18, v11 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v19, s[4:5], v19, v22 +; CGP-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v23, v12 +; CGP-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v21 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 +; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v24 +; CGP-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 +; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v18, v17 +; CGP-NEXT: v_add_i32_e64 v17, s[4:5], v22, v19 +; CGP-NEXT: v_add_i32_e64 v18, s[4:5], v23, v20 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 +; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v18, v17 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 +; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v13, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v11, v1, v8 +; CGP-NEXT: v_mul_hi_u32 v13, v0, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 +; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v12 +; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v10, v14, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v12, v3, v7 +; CGP-NEXT: v_mul_hi_u32 v14, v2, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 +; CGP-NEXT: v_mul_lo_u32 v15, v0, v9 +; CGP-NEXT: v_mul_lo_u32 v16, v1, v9 +; CGP-NEXT: v_mul_hi_u32 v17, v0, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v1, v9 +; CGP-NEXT: v_mul_lo_u32 v18, v2, v10 +; CGP-NEXT: v_mul_lo_u32 v19, v3, v10 +; CGP-NEXT: v_mul_hi_u32 v20, v2, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v3, v10 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v16, v8 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v18 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v19, v7 +; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v17 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v20 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v15, v11 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v18, v12 +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v19, v14 +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11 +; CGP-NEXT: v_mul_lo_u32 v13, v8, s6 +; CGP-NEXT: v_mul_hi_u32 v8, s6, v8 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 +; CGP-NEXT: v_mul_lo_u32 v14, v7, s6 +; CGP-NEXT: v_mul_hi_u32 v7, s6, v7 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 +; CGP-NEXT: v_mul_lo_u32 v9, v9, s6 +; CGP-NEXT: v_mul_lo_u32 v10, v10, s6 +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 +; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v13 +; CGP-NEXT: v_subb_u32_e64 v9, s[6:7], v1, v8, s[4:5] +; CGP-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[6:7] +; CGP-NEXT: v_sub_i32_e64 v2, s[6:7], v2, v14 +; CGP-NEXT: v_subb_u32_e64 v10, s[8:9], v3, v7, s[6:7] +; CGP-NEXT: v_sub_i32_e64 v3, s[8:9], v3, v7 +; CGP-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v4 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[8:9] +; CGP-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v9 +; CGP-NEXT: v_cndmask_b32_e64 v8, -1, v8, s[8:9] ; CGP-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 -; CGP-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; CGP-NEXT: v_cndmask_b32_e64 v7, -1, v7, s[4:5] ; CGP-NEXT: v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[6:7] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: s_mov_b64 s[4:5], vcc -; CGP-NEXT: v_subrev_i32_e32 v11, vcc, 0x12d8fb, v9 +; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0x12d8fb, v4 ; CGP-NEXT: v_sub_i32_e64 v12, s[6:7], v0, v4 ; CGP-NEXT: v_subbrev_u32_e64 v1, s[6:7], 0, v1, s[6:7] ; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v4 @@ -1551,20 +1551,20 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v4 ; CGP-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; CGP-NEXT: v_cndmask_b32_e64 v10, -1, v10, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v10 -; CGP-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[4:5] +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6 +; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5] ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6 -; CGP-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[4:5] -; CGP-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 +; CGP-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] %result = urem <2 x i64> %num, ret <2 x i64> %result diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll index aa1d44c31606b..be3762844d9ea 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll @@ -5,9 +5,9 @@ ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] -; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] -; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] +; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x200, [[BASE]] +; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x400, [[BASE]] +; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x800, [[BASE]] ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x200, [[BASE]] ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x400, [[BASE]] @@ -50,8 +50,8 @@ bb: ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] -; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] +; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x400, [[BASE]] +; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x800, [[BASE]] ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x400, [[BASE]] ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x800, [[BASE]] @@ -94,9 +94,9 @@ bb: ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] +; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x800, [[BASE]] ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] -; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] +; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x200, [[BASE]] ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x800, [[BASE]] ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x400, [[BASE]] @@ -209,7 +209,7 @@ bb: ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] +; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x800, [[BASE]] ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x800, [[BASE]] ; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:50 @@ -283,9 +283,9 @@ bb: ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] -; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] -; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] +; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x200, [[BASE]] +; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x400, [[BASE]] +; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x800, [[BASE]] ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x200, [[BASE]] ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x400, [[BASE]] @@ -319,9 +319,9 @@ bb: ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] -; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] -; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] +; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x800, [[BASE]] +; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x400, [[BASE]] +; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x200, [[BASE]] ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x800, [[BASE]] ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x400, [[BASE]] @@ -409,7 +409,7 @@ bb: ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] +; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x800, [[BASE]] ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x800, [[BASE]] ; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50 diff --git a/llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir b/llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir index 2b5ec86244ec2..c872857df440b 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir @@ -15,9 +15,8 @@ body: | ; GCN-LABEL: name: shrink_vgpr_fi_vgpr_v_add_i32_e64_no_carry_out_use ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], [[COPY]], implicit-def $vcc, implicit $exec + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %stack.0, [[COPY]], implicit-def $vcc, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e32_]] %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32 = COPY $vgpr0 @@ -40,9 +39,8 @@ body: | ; GCN-LABEL: name: shrink_vgpr_vgpr_fi_v_add_i32_e64_no_carry_out_use ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $exec + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %stack.0, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e32_]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec @@ -115,9 +113,8 @@ body: | ; GCN-LABEL: name: shrink_sgpr_fi_vgpr_v_add_i32_e64_no_carry_out_use ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[COPY]], implicit-def $vcc, implicit $exec + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %stack.0, [[COPY]], implicit-def $vcc, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e32_]] %0:sreg_32_xm0 = S_MOV_B32 %stack.0 %1:vgpr_32 = COPY $vgpr0 @@ -141,8 +138,7 @@ body: | ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 - ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[COPY]], implicit-def $vcc, implicit $exec + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %stack.0, [[COPY]], implicit-def $vcc, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e32_]] %0:vgpr_32 = COPY $vgpr0 %1:sreg_32_xm0 = S_MOV_B32 %stack.0 @@ -162,8 +158,8 @@ body: | bb.0: ; GCN-LABEL: name: shrink_vgpr_imm_fi_vgpr_v_add_i32_e64_no_carry_out_use - ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 16, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $exec + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %stack.0, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e32_]] %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 16, implicit $exec @@ -204,8 +200,8 @@ body: | bb.0: ; GCN-LABEL: name: shrink_vgpr_k_fi_vgpr_v_add_i32_e64_no_carry_out_use - ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 1234, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $exec + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1234, implicit $exec + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %stack.0, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e32_]] %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 1234, implicit $exec @@ -225,8 +221,8 @@ body: | bb.0: ; GCN-LABEL: name: shrink_vgpr_k_vgpr_fi_v_add_i32_e64_no_carry_out_use - ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1234, implicit $exec - ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %stack.0, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $exec + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 1234, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e32_]] %0:vgpr_32 = V_MOV_B32_e32 1234, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir b/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir index fc2d4807f72d4..da8c98501b1d1 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir @@ -11,9 +11,8 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: shrink_scalar_imm_vgpr_v_add_i32_e64_other_carry_out_use - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345 - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 12345, [[DEF]], implicit-def $vcc, implicit $exec ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY killed $vcc ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:sreg_32_xm0 = S_MOV_B32 12345 @@ -33,12 +32,11 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: shrink_scalar_imm_multi_use_with_used_carry - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345 - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 12345, [[DEF]], implicit-def $vcc, implicit $exec ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY killed $vcc - ; GCN-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF1]], implicit-def $vcc, implicit $exec + ; GCN-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 12345, [[DEF1]], implicit-def $vcc, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]], implicit [[V_ADD_CO_U32_e32_1]] %0:sreg_32_xm0 = S_MOV_B32 12345 %1:vgpr_32 = IMPLICIT_DEF @@ -61,9 +59,8 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: shrink_scalar_imm_vgpr_v_add_i32_e64_dbg_only_carry_out_use - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345 - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 12345, [[DEF]], implicit-def $vcc, implicit $exec ; GCN-NEXT: DBG_VALUE %5:sreg_64_xexec, $noreg ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e32_]] %0:sreg_32_xm0 = S_MOV_B32 12345 @@ -87,11 +84,10 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: shrink_scalar_imm_vgpr_v_add_i32_e64_carry_out_use - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345 - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 12345, [[DEF]], implicit-def $vcc, implicit $exec ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY killed $vcc ; GCN-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[DEF1]], [[DEF2]], [[COPY]], 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ADDC_U32_e64_]] diff --git a/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink.mir b/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink.mir index 2bf0ceaa56818..ed966efaf06c4 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink.mir @@ -9,9 +9,8 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: shrink_scalar_imm_vgpr_v_add_i32_e64_no_carry_out_use - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345 - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 12345, [[DEF]], implicit-def $vcc, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e32_]] %0:sreg_32_xm0 = S_MOV_B32 12345 %1:vgpr_32 = IMPLICIT_DEF @@ -29,8 +28,7 @@ body: | bb.0: ; GCN-LABEL: name: shrink_vgpr_scalar_imm_v_add_i32_e64_no_carry_out_use ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345 - ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 12345, [[DEF]], implicit-def $vcc, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e32_]] %0:vgpr_32 = IMPLICIT_DEF %1:sreg_32_xm0 = S_MOV_B32 12345 @@ -46,9 +44,8 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: shrink_scalar_imm_vgpr_v_add_i32_e64_carry_out_use - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345 - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 12345, [[DEF]], implicit-def $vcc, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e32_]] %0:sreg_32_xm0 = S_MOV_B32 12345 %1:vgpr_32 = IMPLICIT_DEF @@ -263,9 +260,8 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: shrink_scalar_imm_vgpr_v_sub_i32_e64_no_carry_out_use - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345 - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_SUB_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_CO_U32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_SUB_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_CO_U32_e32 12345, [[DEF]], implicit-def $vcc, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_SUB_CO_U32_e32_]] %0:sreg_32_xm0 = S_MOV_B32 12345 %1:vgpr_32 = IMPLICIT_DEF @@ -283,8 +279,7 @@ body: | bb.0: ; GCN-LABEL: name: shrink_vgpr_scalar_imm_v_sub_i32_e64_no_carry_out_use ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345 - ; GCN-NEXT: [[V_SUBREV_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUBREV_CO_U32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec + ; GCN-NEXT: [[V_SUBREV_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUBREV_CO_U32_e32 12345, [[DEF]], implicit-def $vcc, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_SUBREV_CO_U32_e32_]] %0:vgpr_32 = IMPLICIT_DEF %1:sreg_32_xm0 = S_MOV_B32 12345 @@ -301,9 +296,8 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: shrink_scalar_imm_vgpr_v_subrev_i32_e64_no_carry_out_use - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345 - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_SUBREV_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUBREV_CO_U32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_SUBREV_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUBREV_CO_U32_e32 12345, [[DEF]], implicit-def $vcc, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_SUBREV_CO_U32_e32_]] %0:sreg_32_xm0 = S_MOV_B32 12345 %1:vgpr_32 = IMPLICIT_DEF @@ -321,8 +315,7 @@ body: | bb.0: ; GCN-LABEL: name: shrink_vgpr_scalar_imm_v_subrev_i32_e64_no_carry_out_use ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345 - ; GCN-NEXT: [[V_SUB_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_CO_U32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec + ; GCN-NEXT: [[V_SUB_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_CO_U32_e32 12345, [[DEF]], implicit-def $vcc, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_SUB_CO_U32_e32_]] %0:vgpr_32 = IMPLICIT_DEF %1:sreg_32_xm0 = S_MOV_B32 12345 @@ -373,9 +366,8 @@ body: | ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345 ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 12345, [[DEF]], implicit-def $vcc, implicit $exec ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: {{ $}} @@ -439,9 +431,8 @@ body: | ; GCN-NEXT: successors: %bb.1(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: S_NOP 0, implicit-def $vcc - ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345 ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 12345, [[DEF]], implicit-def $vcc, implicit $exec ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: {{ $}} @@ -472,8 +463,7 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: vcc_liveness_dbg_value_search_before - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345 - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: DBG_VALUE $noreg, 0 ; GCN-NEXT: DBG_VALUE $noreg, 0 ; GCN-NEXT: DBG_VALUE $noreg, 0 @@ -502,7 +492,7 @@ body: | ; GCN-NEXT: DBG_VALUE $noreg, 0 ; GCN-NEXT: DBG_VALUE $noreg, 0 ; GCN-NEXT: DBG_VALUE $noreg, 0 - ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 12345, [[DEF]], implicit-def $vcc, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e32_]] %0:sreg_32_xm0 = S_MOV_B32 12345 %1:vgpr_32 = IMPLICIT_DEF @@ -549,8 +539,7 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: vcc_liveness_dbg_value_search_after - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345 - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: S_NOP 0 @@ -579,7 +568,7 @@ body: | ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 12345, [[DEF]], implicit-def $vcc, implicit $exec ; GCN-NEXT: DBG_VALUE $noreg, 0 ; GCN-NEXT: DBG_VALUE $noreg, 0 ; GCN-NEXT: DBG_VALUE $noreg, 0 @@ -683,8 +672,7 @@ body: | ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 518144, implicit $exec - ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 killed [[V_MOV_B32_e32_]], [[COPY]], implicit-def $vcc, implicit $exec + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 518144, [[COPY]], implicit-def $vcc, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e32_]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_MOV_B32_e32 518144, implicit $exec @@ -702,8 +690,7 @@ body: | ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 518144, implicit $exec - ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], killed [[COPY]], implicit-def $vcc, implicit $exec + ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 518144, killed [[COPY]], implicit-def $vcc, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e32_]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_MOV_B32_e32 518144, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 61017e809c863..70a7f67f5b8d0 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -3273,9 +3273,8 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) { ; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v6 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v0, v2 ; GFX67-SDAG-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX67-SDAG-NEXT: s_movk_i32 s4, 0x100 ; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 -; GFX67-SDAG-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; GFX67-SDAG-NEXT: v_add_i32_e32 v3, vcc, 0x100, v3 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v1 ; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll index 671ead6127308..440a0666aef8c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -397,9 +397,10 @@ define i1 @posnormal_f16(half %x) nounwind { ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v2 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v1 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v1 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, 0x400 +; GFX7GLISEL-NEXT: v_sub_i32_e32 v0, vcc, 0x400, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -461,9 +462,10 @@ define i1 @negnormal_f16(half %x) nounwind { ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX7GLISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, v2 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v1 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7GLISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, v1 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, 0x400 +; GFX7GLISEL-NEXT: v_sub_i32_e32 v0, vcc, 0x400, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -1537,12 +1539,13 @@ define i1 @not_issubnormal_or_zero_f16(half %x) { ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: s_movk_i32 s4, 0x7c00 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v1 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v0 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, 0x400 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_sub_i32_e32 v0, vcc, 0x400, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -1600,8 +1603,8 @@ define i1 @isnormal_f16(half %x) { ; GFX7GLISEL-LABEL: isnormal_f16: ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, 0x400 +; GFX7GLISEL-NEXT: v_sub_i32_e32 v0, vcc, 0x400, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -1725,18 +1728,19 @@ define i1 @not_is_plus_normal_f16(half %x) { ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7c00, v0 -; GFX7GLISEL-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: s_movk_i32 s8, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], s8, v3 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], s8, v1 ; GFX7GLISEL-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s8, v3 +; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s8, v1 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, 0x400 ; GFX7GLISEL-NEXT: s_or_b64 s[6:7], s[6:7], vcc -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v1 +; GFX7GLISEL-NEXT: v_sub_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], v2, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 -; GFX7GLISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], v2, v3 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 ; GFX7GLISEL-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] @@ -1798,18 +1802,19 @@ define i1 @not_is_neg_normal_f16(half %x) { ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7c00, v0 -; GFX7GLISEL-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: s_movk_i32 s8, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], s8, v3 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], s8, v1 ; GFX7GLISEL-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s8, v3 +; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s8, v1 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, 0x400 ; GFX7GLISEL-NEXT: s_or_b64 s[6:7], s[6:7], vcc -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v1 +; GFX7GLISEL-NEXT: v_sub_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, v3 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 ; GFX7GLISEL-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] @@ -1921,14 +1926,15 @@ define i1 @not_issubnormal_f16(half %x) { ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: s_movk_i32 s6, 0x7c00 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v1 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v0 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 +; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, 0x400 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_sub_i32_e32 v0, vcc, 0x400, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -2038,14 +2044,15 @@ define i1 @not_iszero_f16(half %x) { ; GFX7GLISEL-NEXT: v_subrev_i32_e32 v1, vcc, 1, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 -; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: s_movk_i32 s6, 0x7c00 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v1 +; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v0 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 +; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, 0x400 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_sub_i32_e32 v0, vcc, 0x400, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -2532,11 +2539,12 @@ define i1 @not_iszero_or_nan_f16(half %x) { ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 -; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v2 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v1 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, 0x400 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_sub_i32_e32 v0, vcc, 0x400, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -2599,11 +2607,12 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 { ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 -; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v2 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v1 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, 0x400 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_sub_i32_e32 v0, vcc, 0x400, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -2666,11 +2675,12 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 -; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v2 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v1 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, 0x400 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_sub_i32_e32 v0, vcc, 0x400, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -2868,16 +2878,17 @@ define i1 @not_iszero_or_qnan_f16(half %x) { ; GFX7GLISEL-NEXT: v_subrev_i32_e32 v1, vcc, 1, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 -; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: s_movk_i32 s8, 0x7c00 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s8, v1 -; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s8, v0 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX7GLISEL-NEXT: s_or_b64 s[6:7], vcc, s[4:5] -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s8, v1 -; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v1, v2 +; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s8, v0 +; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, 0x400 ; GFX7GLISEL-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_sub_i32_e32 v0, vcc, 0x400, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] @@ -2951,14 +2962,15 @@ define i1 @not_iszero_or_snan_f16(half %x) { ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 -; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v2 -; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00 +; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v1 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7GLISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; GFX7GLISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, 0x400 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_sub_i32_e32 v0, vcc, 0x400, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index a462c19ce645d..c2b10c160bf58 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -54,13 +54,11 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_movk_i32 s0, 0x2000 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc -; GFX8-NEXT: s_movk_i32 s0, 0x2800 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v0 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x2800, v0 ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] ; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] -; GFX8-NEXT: s_movk_i32 s0, 0x3000 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v0 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x3000, v0 ; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x3800, v0 @@ -132,8 +130,7 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX900-NEXT: global_load_dwordx2 v[12:13], v[10:11], off offset:2048 ; GFX900-NEXT: global_load_dwordx2 v[14:15], v[6:7], off ; GFX900-NEXT: global_load_dwordx2 v[16:17], v[6:7], off offset:2048 -; GFX900-NEXT: s_movk_i32 s0, 0x3000 -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX900-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX900-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048 @@ -276,8 +273,7 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[10:11], off offset:2048 ; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[6:7], off ; GFX90A-NEXT: global_load_dwordx2 v[16:17], v[6:7], off offset:2048 -; GFX90A-NEXT: s_movk_i32 s0, 0x3000 -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048 @@ -573,21 +569,17 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_mov_b32_e32 v3, s35 ; GFX900-NEXT: v_add_co_u32_e32 v1, vcc, s34, v1 ; GFX900-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v3, vcc -; GFX900-NEXT: s_movk_i32 s0, 0x5000 -; GFX900-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 +; GFX900-NEXT: v_add_co_u32_e32 v1, vcc, 0x5000, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; GFX900-NEXT: s_movk_i32 s2, 0x7f ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: s_movk_i32 s0, 0xd000 -; GFX900-NEXT: s_movk_i32 s1, 0xe000 -; GFX900-NEXT: s_movk_i32 s3, 0xf000 +; GFX900-NEXT: s_movk_i32 s0, 0x7f ; GFX900-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX900-NEXT: ; =>This Loop Header: Depth=1 ; GFX900-NEXT: ; Child Loop BB1_2 Depth 2 ; GFX900-NEXT: v_mov_b32_e32 v6, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: s_mov_b32 s4, 0 +; GFX900-NEXT: s_mov_b32 s1, 0 ; GFX900-NEXT: .LBB1_2: ; %for.body ; GFX900-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX900-NEXT: ; => This Inner Loop Header: Depth=2 @@ -599,23 +591,23 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: global_load_dwordx2 v[7:8], v[7:8], off ; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v6, vcc ; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048 -; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, s0, v5 +; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, 0xffffd000, v5 ; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, -1, v6, vcc ; GFX900-NEXT: global_load_dwordx2 v[15:16], v[15:16], off offset:-2048 -; GFX900-NEXT: v_add_co_u32_e32 v19, vcc, s1, v5 +; GFX900-NEXT: v_add_co_u32_e32 v19, vcc, 0xffffe000, v5 ; GFX900-NEXT: global_load_dwordx2 v[13:14], v[13:14], off ; GFX900-NEXT: v_addc_co_u32_e32 v20, vcc, -1, v6, vcc ; GFX900-NEXT: global_load_dwordx2 v[23:24], v[19:20], off offset:-4096 ; GFX900-NEXT: global_load_dwordx2 v[25:26], v[19:20], off offset:-2048 ; GFX900-NEXT: global_load_dwordx2 v[27:28], v[19:20], off -; GFX900-NEXT: v_add_co_u32_e32 v21, vcc, s3, v5 +; GFX900-NEXT: v_add_co_u32_e32 v21, vcc, 0xfffff000, v5 ; GFX900-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v6, vcc ; GFX900-NEXT: global_load_dwordx2 v[19:20], v[21:22], off offset:-2048 ; GFX900-NEXT: global_load_dwordx2 v[29:30], v[5:6], off ; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, 0x10000, v5 ; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX900-NEXT: s_addk_i32 s4, 0x2000 -; GFX900-NEXT: s_cmp_gt_u32 s4, 0x3fffff +; GFX900-NEXT: s_addk_i32 s1, 0x2000 +; GFX900-NEXT: s_cmp_gt_u32 s1, 0x3fffff ; GFX900-NEXT: s_waitcnt vmcnt(8) ; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, v7, v3 ; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc @@ -649,11 +641,11 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX900-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX900-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; GFX900-NEXT: s_add_i32 s4, s2, -1 -; GFX900-NEXT: s_cmp_eq_u32 s2, 0 +; GFX900-NEXT: s_add_i32 s1, s0, -1 +; GFX900-NEXT: s_cmp_eq_u32 s0, 0 ; GFX900-NEXT: s_cbranch_scc1 .LBB1_5 ; GFX900-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 -; GFX900-NEXT: s_mov_b32 s2, s4 +; GFX900-NEXT: s_mov_b32 s0, s1 ; GFX900-NEXT: s_branch .LBB1_1 ; GFX900-NEXT: .LBB1_5: ; %while.end ; GFX900-NEXT: v_mov_b32_e32 v1, s35 @@ -805,19 +797,15 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: v_mov_b32_e32 v2, s35 ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, s34, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v2, vcc -; GFX90A-NEXT: s_movk_i32 s0, 0x5000 -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v1 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x5000, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX90A-NEXT: s_movk_i32 s2, 0x7f +; GFX90A-NEXT: s_movk_i32 s0, 0x7f ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], 0, 0 -; GFX90A-NEXT: s_movk_i32 s0, 0xd000 -; GFX90A-NEXT: s_movk_i32 s1, 0xe000 -; GFX90A-NEXT: s_movk_i32 s3, 0xf000 ; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB1_2 Depth 2 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: s_mov_b32 s4, 0 +; GFX90A-NEXT: s_mov_b32 s1, 0 ; GFX90A-NEXT: .LBB1_2: ; %for.body ; GFX90A-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 @@ -829,23 +817,23 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off ; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc ; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 -; GFX90A-NEXT: v_add_co_u32_e32 v16, vcc, s0, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v16, vcc, 0xffffd000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v17, vcc, -1, v7, vcc ; GFX90A-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048 -; GFX90A-NEXT: v_add_co_u32_e32 v20, vcc, s1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v20, vcc, 0xffffe000, v6 ; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[14:15], off ; GFX90A-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v7, vcc ; GFX90A-NEXT: global_load_dwordx2 v[24:25], v[20:21], off offset:-4096 ; GFX90A-NEXT: global_load_dwordx2 v[26:27], v[20:21], off offset:-2048 ; GFX90A-NEXT: global_load_dwordx2 v[28:29], v[20:21], off -; GFX90A-NEXT: v_add_co_u32_e32 v22, vcc, s3, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v22, vcc, 0xfffff000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v7, vcc ; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[22:23], off offset:-2048 ; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[6:7], off ; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x10000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX90A-NEXT: s_addk_i32 s4, 0x2000 -; GFX90A-NEXT: s_cmp_gt_u32 s4, 0x3fffff +; GFX90A-NEXT: s_addk_i32 s1, 0x2000 +; GFX90A-NEXT: s_cmp_gt_u32 s1, 0x3fffff ; GFX90A-NEXT: s_waitcnt vmcnt(8) ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v12, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc @@ -879,11 +867,11 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX90A-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX90A-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; GFX90A-NEXT: s_add_i32 s4, s2, -1 -; GFX90A-NEXT: s_cmp_eq_u32 s2, 0 +; GFX90A-NEXT: s_add_i32 s1, s0, -1 +; GFX90A-NEXT: s_cmp_eq_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB1_5 ; GFX90A-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 -; GFX90A-NEXT: s_mov_b32 s2, s4 +; GFX90A-NEXT: s_mov_b32 s0, s1 ; GFX90A-NEXT: s_branch .LBB1_1 ; GFX90A-NEXT: .LBB1_5: ; %while.end ; GFX90A-NEXT: v_mov_b32_e32 v1, s35 @@ -1163,10 +1151,8 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_movk_i32 s0, 0x1800 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc -; GFX8-NEXT: s_movk_i32 s0, 0x1c00 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v0 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x1c00, v0 ; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc -; GFX8-NEXT: s_movk_i32 s0, 0x2000 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v19, v[5:6] ; GFX8-NEXT: flat_load_dword v7, v[7:8] @@ -1175,7 +1161,7 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX8-NEXT: flat_load_dword v10, v[13:14] ; GFX8-NEXT: flat_load_dword v11, v[15:16] ; GFX8-NEXT: flat_load_dword v12, v[17:18] -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x2000, v0 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x2400, v0 ; GFX8-NEXT: flat_load_dword v5, v[5:6] @@ -1230,10 +1216,9 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s34, v4 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v0, vcc ; GFX900-NEXT: v_lshlrev_b64 v[0:1], 2, v[1:2] -; GFX900-NEXT: s_movk_i32 s0, 0x1000 ; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 +; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x1000, v0 ; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX900-NEXT: global_load_dword v5, v[0:1], off ; GFX900-NEXT: global_load_dword v6, v[0:1], off offset:1024 @@ -1357,8 +1342,7 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 2, v[2:3] ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v1, vcc -; GFX90A-NEXT: s_movk_i32 s0, 0x1000 -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x1000, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: global_load_dword v6, v[0:1], off offset:1024 @@ -1526,10 +1510,9 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_movk_i32 s0, 0xf000 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: s_movk_i32 s0, 0xf800 ; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[0:1] ; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] -; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0, v0 @@ -1804,11 +1787,9 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc -; GFX8-NEXT: s_mov_b32 s0, 0x7ffff800 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7ffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: s_mov_b32 s0, 0x7ffffc00 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7ffffc00, v0 ; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v5, v[5:6] @@ -2348,13 +2329,11 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_movk_i32 s0, 0x2000 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc -; GFX8-NEXT: s_movk_i32 s0, 0x1800 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v0 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x1800, v0 ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] ; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] -; GFX8-NEXT: s_movk_i32 s0, 0x1000 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v0 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x1000, v0 ; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x800, v0 @@ -2424,8 +2403,7 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX900-NEXT: global_load_dwordx2 v[10:11], v[4:5], off offset:2048 -; GFX900-NEXT: s_movk_i32 s0, 0x1000 -; GFX900-NEXT: v_add_co_u32_e32 v12, vcc, s0, v0 +; GFX900-NEXT: v_add_co_u32_e32 v12, vcc, 0x1000, v0 ; GFX900-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc ; GFX900-NEXT: global_load_dwordx2 v[14:15], v[12:13], off ; GFX900-NEXT: global_load_dwordx2 v[16:17], v[4:5], off @@ -2571,8 +2549,7 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[4:5], off offset:2048 -; GFX90A-NEXT: s_movk_i32 s0, 0x1000 -; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, s0, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, 0x1000, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc ; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[12:13], off ; GFX90A-NEXT: global_load_dwordx2 v[16:17], v[4:5], off @@ -2743,8 +2720,7 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v4, v1, vcc -; GFX8-NEXT: s_movk_i32 s0, 0x800 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x800, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0, v2 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, -1, v6, vcc @@ -2784,10 +2760,9 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s34, v8 ; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v0, vcc ; GFX900-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] -; GFX900-NEXT: s_movk_i32 s0, 0x1000 ; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v3, v0 ; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v1, vcc -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc ; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc @@ -2871,8 +2846,7 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 3, v[2:3] ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v4, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v1, vcc -; GFX90A-NEXT: s_movk_i32 s0, 0x1000 -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 4f2fd3f50494c..d79e4553f7574 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -1394,8 +1394,7 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 ; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5 -; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, s6, v8 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, 0xffffffc5, v8 ; GCN-IR-NEXT: v_addc_u32_e64 v6, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[5:6] @@ -1587,8 +1586,7 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 ; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5 -; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 -; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, s6, v8 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, 0xffffffd0, v8 ; GCN-IR-NEXT: v_addc_u32_e64 v6, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[5:6] @@ -1722,13 +1720,12 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[9:10], v[9:10], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 31, v4 ; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 -; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, s12, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, 0x7fff, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, 0, v10, vcc ; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v7 ; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[3:4], 1 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index 8da720d7f991c..2be03b9083412 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -419,14 +419,11 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; SI-GISEL-NEXT: s_mov_b32 s6, 0 -; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0x41, v2 -; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-GISEL-NEXT: s_mov_b32 s2, 0 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x41 +; SI-GISEL-NEXT: v_sub_i32_e32 v2, vcc, 0x41, v2 ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm ; @@ -452,17 +449,12 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x41 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_subrev_u32_e32 v2, vcc, 0x41, v3 +; VI-GISEL-NEXT: v_sub_u32_e32 v2, vcc, 0x41, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; @@ -941,14 +933,11 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; SI-GISEL-NEXT: s_mov_b32 s6, 0 -; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0xffffffef, v2 -; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-GISEL-NEXT: s_mov_b32 s2, 0 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xffffffef +; SI-GISEL-NEXT: v_sub_i32_e32 v2, vcc, 0xffffffef, v2 ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm ; @@ -974,17 +963,12 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xffffffef ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_subrev_u32_e32 v2, vcc, 0xffffffef, v3 +; VI-GISEL-NEXT: v_sub_u32_e32 v2, vcc, 0xffffffef, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; @@ -1931,12 +1915,12 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7b +; SI-GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0x7b, v3 +; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 0x7b, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] @@ -2317,11 +2301,11 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0xffffc400 +; SI-GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0xffffc400, v3 +; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 0xffffc400, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] @@ -2452,11 +2436,11 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4400 +; SI-GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0x4400, v3 +; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 0x4400, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] @@ -2591,11 +2575,10 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-GISEL-NEXT: s_movk_i32 s2, 0xffe0 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, s2, v2 -; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, s2, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 0xffffffe0, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, 0xffffffe0, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -2753,11 +2736,11 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0xffffffe0 ; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, 0xffffffe0, v3 -; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] @@ -2913,12 +2896,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0xffffffe0 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, 0xffffffe0, v3 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 0xffffffe0, v2 -; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-GISEL-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -3554,11 +3538,10 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-GISEL-NEXT: s_movk_i32 s2, 0xc400 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, s2, v2 -; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, s2, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 0xffffc400, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, 0xffffc400, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -3720,11 +3703,10 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-GISEL-NEXT: s_movk_i32 s2, 0x4400 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, s2, v2 -; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, s2, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 0x4400, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, 0x4400, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -3886,11 +3868,10 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-GISEL-NEXT: s_movk_i32 s2, 0x4000 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, s2, v2 -; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, s2, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 0x4000, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, 0x4000, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -4052,11 +4033,10 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-GISEL-NEXT: s_movk_i32 s2, 0xc000 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, s2, v2 -; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, s2, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 0xffffc000, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, 0xffffc000, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -4211,17 +4191,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; SI-GISEL-NEXT: s_mov_b32 s6, 0 -; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xffffffe0 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-GISEL-NEXT: s_mov_b32 s2, 0 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 0xffffffe0, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm ; @@ -4368,15 +4344,12 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; SI-GISEL-NEXT: s_mov_b32 s6, 0 -; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b32 s2, 0 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xffffffe0 ; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 0xffffffe0, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index 08db1e7fee259..dd4eb0ae2a09e 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -5000,22 +5000,14 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 13, v0 -; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x80 +; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 4 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, s2, v5 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v0, vcc -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x80, v2 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968 -; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 4 -; GFX9-FLATSCR-NEXT: s_movk_i32 s5, 0x84 -; GFX9-FLATSCR-NEXT: s_movk_i32 s6, 0x104 -; GFX9-FLATSCR-NEXT: s_movk_i32 s7, 0x184 -; GFX9-FLATSCR-NEXT: s_movk_i32 s8, 0x204 -; GFX9-FLATSCR-NEXT: s_movk_i32 s9, 0x284 -; GFX9-FLATSCR-NEXT: s_movk_i32 s10, 0x304 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x384 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984 @@ -5043,1268 +5035,1268 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v2 ; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x74 +; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x100 -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 -; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x84 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984 -; GFX9-FLATSCR-NEXT: s_movk_i32 s5, 0x94 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x94 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000 -; GFX9-FLATSCR-NEXT: s_movk_i32 s5, 0xa4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016 -; GFX9-FLATSCR-NEXT: s_movk_i32 s5, 0xb4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032 -; GFX9-FLATSCR-NEXT: s_movk_i32 s5, 0xc4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048 -; GFX9-FLATSCR-NEXT: s_movk_i32 s5, 0xd4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064 -; GFX9-FLATSCR-NEXT: s_movk_i32 s5, 0xe4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080 -; GFX9-FLATSCR-NEXT: s_movk_i32 s5, 0xf4 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s5, 0x180 -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s5, v2 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x180, v2 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf4 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x104 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984 -; GFX9-FLATSCR-NEXT: s_movk_i32 s6, 0x114 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x114 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000 -; GFX9-FLATSCR-NEXT: s_movk_i32 s6, 0x124 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x124 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016 -; GFX9-FLATSCR-NEXT: s_movk_i32 s6, 0x134 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x134 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032 -; GFX9-FLATSCR-NEXT: s_movk_i32 s6, 0x144 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x144 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048 -; GFX9-FLATSCR-NEXT: s_movk_i32 s6, 0x154 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x154 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064 -; GFX9-FLATSCR-NEXT: s_movk_i32 s6, 0x164 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x164 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080 -; GFX9-FLATSCR-NEXT: s_movk_i32 s6, 0x174 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s6, 0x200 -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x200, v2 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x174 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x184 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984 -; GFX9-FLATSCR-NEXT: s_movk_i32 s7, 0x194 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x194 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000 -; GFX9-FLATSCR-NEXT: s_movk_i32 s7, 0x1a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016 -; GFX9-FLATSCR-NEXT: s_movk_i32 s7, 0x1b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032 -; GFX9-FLATSCR-NEXT: s_movk_i32 s7, 0x1c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048 -; GFX9-FLATSCR-NEXT: s_movk_i32 s7, 0x1d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064 -; GFX9-FLATSCR-NEXT: s_movk_i32 s7, 0x1e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080 -; GFX9-FLATSCR-NEXT: s_movk_i32 s7, 0x1f4 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s7, 0x280 -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s7, v2 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x280, v2 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1f4 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x204 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984 -; GFX9-FLATSCR-NEXT: s_movk_i32 s8, 0x214 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x214 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000 -; GFX9-FLATSCR-NEXT: s_movk_i32 s8, 0x224 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x224 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016 -; GFX9-FLATSCR-NEXT: s_movk_i32 s8, 0x234 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x234 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032 -; GFX9-FLATSCR-NEXT: s_movk_i32 s8, 0x244 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x244 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048 -; GFX9-FLATSCR-NEXT: s_movk_i32 s8, 0x254 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x254 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064 -; GFX9-FLATSCR-NEXT: s_movk_i32 s8, 0x264 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x264 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080 -; GFX9-FLATSCR-NEXT: s_movk_i32 s8, 0x274 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s8, 0x300 -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s8, v2 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x300, v2 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x274 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x284 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984 -; GFX9-FLATSCR-NEXT: s_movk_i32 s9, 0x294 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x294 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000 -; GFX9-FLATSCR-NEXT: s_movk_i32 s9, 0x2a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016 -; GFX9-FLATSCR-NEXT: s_movk_i32 s9, 0x2b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032 -; GFX9-FLATSCR-NEXT: s_movk_i32 s9, 0x2c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048 -; GFX9-FLATSCR-NEXT: s_movk_i32 s9, 0x2d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064 -; GFX9-FLATSCR-NEXT: s_movk_i32 s9, 0x2e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080 -; GFX9-FLATSCR-NEXT: s_movk_i32 s9, 0x2f4 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s9, 0x380 -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s9, v2 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x380, v2 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2f4 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x304 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984 -; GFX9-FLATSCR-NEXT: s_movk_i32 s10, 0x314 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x314 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000 -; GFX9-FLATSCR-NEXT: s_movk_i32 s10, 0x324 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x324 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016 -; GFX9-FLATSCR-NEXT: s_movk_i32 s10, 0x334 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x334 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032 -; GFX9-FLATSCR-NEXT: s_movk_i32 s10, 0x344 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x344 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048 -; GFX9-FLATSCR-NEXT: s_movk_i32 s10, 0x354 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x354 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064 -; GFX9-FLATSCR-NEXT: s_movk_i32 s10, 0x364 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x364 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080 -; GFX9-FLATSCR-NEXT: s_movk_i32 s10, 0x374 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s10, 0x400 -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s10, v2 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x400, v2 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x374 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x384 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x394 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x394 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x3a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x3b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x3c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x3d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:4080 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x3e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s11 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x3f4 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x404 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x404 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, s1 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:16 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x414 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x414 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:32 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x424 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x424 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:48 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x434 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x434 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:64 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x444 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x444 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:80 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x454 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x454 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:96 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x464 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x464 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:112 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x474 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x474 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:128 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x484 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x484 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:144 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x494 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x494 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:160 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x4a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:176 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x4b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:192 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x4c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:208 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x4d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:224 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x4e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:240 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x4f4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:256 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x504 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x504 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:272 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x514 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x514 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:288 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x524 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x524 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:304 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x534 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x534 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:320 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x544 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x544 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:336 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x554 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x554 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:352 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x564 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x564 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:368 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x574 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x574 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:384 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x584 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x584 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:400 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x594 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x594 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:416 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x5a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:432 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x5b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:448 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x5c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:464 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x5d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:480 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x5e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:496 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x5f4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:512 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x604 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x604 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:528 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x614 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x614 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:544 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x624 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x624 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:560 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x634 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x634 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:576 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x644 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x644 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:592 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x654 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x654 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:608 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x664 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x664 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:624 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x674 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x674 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:640 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x684 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x684 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:656 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x694 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x694 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:672 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x6a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:688 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x6b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:704 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x6c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:720 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x6d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:736 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x6e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:752 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x6f4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:768 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x704 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x704 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:784 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x714 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x714 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:800 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x724 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x724 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:816 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x734 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x734 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:832 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x744 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x744 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:848 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x754 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x754 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:864 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x764 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x764 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:880 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x774 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x774 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:896 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x784 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x784 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:912 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x794 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x794 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:928 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x7a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:944 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x7b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:960 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x7c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:976 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x7d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:992 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x7e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1008 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x7f4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1024 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x804 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x804 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1040 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x814 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x814 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1056 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x824 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x824 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1072 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x834 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x834 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1088 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x844 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x844 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1104 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x854 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x854 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1120 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x864 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x864 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1136 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x874 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x874 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1152 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x884 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x884 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1168 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x894 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x894 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1184 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x8a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1200 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x8b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1216 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x8c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1232 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x8d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1248 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x8e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1264 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x8f4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1280 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x904 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x904 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1296 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x914 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x914 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1312 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x924 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x924 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1328 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x934 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x934 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1344 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x944 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x944 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1360 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x954 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x954 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1376 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x964 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x964 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1392 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x974 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x974 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1408 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x984 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x984 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1424 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x994 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x994 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1440 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x9a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1456 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x9b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1472 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x9c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1488 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x9d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1504 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x9e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1520 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x9f4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1536 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xa04 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa04 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1552 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xa14 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa14 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1568 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xa24 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa24 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1584 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xa34 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa34 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1600 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xa44 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa44 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1616 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xa54 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa54 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1632 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xa64 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa64 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1648 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xa74 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa74 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1664 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xa84 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa84 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1680 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xa94 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa94 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1696 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xaa4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xaa4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1712 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xab4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xab4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1728 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xac4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xac4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1744 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xad4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xad4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1760 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xae4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xae4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1776 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xaf4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xaf4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1792 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xb04 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb04 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1808 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xb14 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb14 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1824 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xb24 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb24 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1840 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xb34 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb34 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1856 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xb44 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb44 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1872 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xb54 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb54 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1888 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xb64 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb64 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1904 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xb74 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb74 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1920 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xb84 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb84 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1936 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xb94 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb94 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1952 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xba4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xba4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1968 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xbb4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbb4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1984 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xbc4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbc4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2000 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xbd4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbd4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2016 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xbe4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbe4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2032 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xbf4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbf4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2048 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xc04 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc04 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2064 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xc14 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc14 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2080 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xc24 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc24 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2096 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xc34 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc34 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2112 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xc44 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc44 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2128 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xc54 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc54 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2144 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xc64 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc64 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2160 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xc74 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc74 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2176 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xc84 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc84 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2192 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xc94 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc94 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2208 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xca4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xca4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2224 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xcb4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xcb4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2240 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xcc4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xcc4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2256 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xcd4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xcd4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2272 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xce4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xce4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2288 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xcf4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xcf4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2304 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xd04 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd04 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2320 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xd14 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd14 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2336 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xd24 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd24 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2352 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xd34 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd34 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2368 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xd44 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd44 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2384 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xd54 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd54 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2400 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xd64 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd64 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2416 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xd74 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd74 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2432 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xd84 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd84 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2448 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xd94 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd94 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2464 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xda4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xda4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2480 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xdb4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xdb4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2496 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xdc4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xdc4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2512 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xdd4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xdd4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2528 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xde4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xde4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2544 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xdf4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xdf4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2560 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xe04 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe04 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2576 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xe14 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe14 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2592 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xe24 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe24 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2608 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xe34 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe34 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2624 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xe44 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe44 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2640 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xe54 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe54 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2656 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xe64 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe64 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2672 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xe74 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe74 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2688 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xe84 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe84 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2704 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xe94 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe94 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2720 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xea4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xea4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2736 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xeb4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xeb4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2752 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xec4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xec4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2768 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xed4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xed4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2784 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xee4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xee4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2800 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xef4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xef4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2816 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xf04 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf04 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2832 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xf14 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf14 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2848 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xf24 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf24 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2864 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xf34 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf34 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2880 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xf44 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf44 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2896 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xf54 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf54 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2912 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xf64 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf64 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2928 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xf74 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf74 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2944 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xf84 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf84 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2960 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xf94 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf94 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2976 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xfa4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfa4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2992 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xfb4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfb4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3008 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xfc4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfc4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3024 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xfd4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfd4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3040 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xfe4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfe4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3056 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0xff4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xff4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3072 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1004 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1004 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3088 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1014 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1014 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3104 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1024 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1024 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3120 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1034 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1034 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3136 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1044 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1044 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3152 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1054 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1054 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3168 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1064 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1064 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3184 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1074 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1074 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3200 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1084 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1084 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3216 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1094 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1094 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3232 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x10a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3248 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x10b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3264 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x10c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3280 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x10d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3296 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x10e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3312 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x10f4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3328 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1104 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1104 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3344 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1114 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1114 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3360 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1124 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1124 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3376 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1134 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1134 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3392 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1144 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1144 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3408 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1154 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1154 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3424 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1164 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1164 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3440 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1174 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1174 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3456 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1184 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1184 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3472 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1194 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1194 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3488 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x11a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3504 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x11b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3520 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x11c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3536 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x11d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3552 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x11e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3568 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x11f4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3584 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1204 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1204 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3600 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1214 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1214 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3616 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1224 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1224 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3632 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1234 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1234 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3648 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1244 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1244 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3664 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1254 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1254 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3680 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1264 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1264 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3696 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1274 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1274 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3712 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1284 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1284 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3728 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1294 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1294 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3744 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x12a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3760 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x12b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3776 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x12c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3792 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x12d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3808 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x12e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3824 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x12f4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12f4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3840 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1304 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1304 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3856 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1314 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1314 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3872 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1324 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1324 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3888 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1334 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1334 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3904 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1344 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1344 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3920 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1354 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1354 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3936 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1364 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1364 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3952 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1374 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1374 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3968 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1384 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1384 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3984 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x1394 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1394 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4000 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x13a4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13a4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4016 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x13b4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13b4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4032 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x13c4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13c4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4048 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x13d4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13d4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4064 -; GFX9-FLATSCR-NEXT: s_movk_i32 s11, 0x13e4 +; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4080 ; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x13e4 ; GFX9-FLATSCR-NEXT: ;;#ASMSTART @@ -7346,7 +7338,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3f4 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s10, v4 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x400, v4 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -7380,7 +7372,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:3968 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s9, v4 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x380, v4 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x364 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -7414,7 +7406,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:3968 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s8, v4 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x300, v4 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -7448,7 +7440,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:3968 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s7, v4 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x280, v4 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x264 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -7482,7 +7474,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:3968 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x200, v4 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1e4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -7516,7 +7508,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:3968 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s5, v4 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x180, v4 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x164 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -7550,7 +7542,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:3968 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 +; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v4 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 24319a639da44..4df9f7831e499 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -1583,8 +1583,7 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 -; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, s6, v6 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 0xffffffc5, v6 ; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4] @@ -1774,8 +1773,7 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 -; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, s6, v6 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 0xffffffd0, v6 ; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4] @@ -1914,13 +1912,12 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 ; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, s12, v10 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 0x7fff, v10 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, 0, v11, vcc ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index e23f3cfad89bc..3d684be885f29 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -1283,13 +1283,12 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 ; GCN-IR-NEXT: v_or_b32_e32 v6, v7, v4 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v6 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 0x7fff, v6 ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v8, vcc ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index f68d14a32b929..2bd7560f91a5b 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -1307,13 +1307,12 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 ; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v8 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 0x7fff, v8 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6