diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index a366db1c580ba..519b376e12629 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -586,6 +586,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16", "Use true 16-bit registers" >; +def FeatureD16Writes32BitVgpr : SubtargetFeature<"d16-write-vgpr32", + "EnableD16Writes32BitVgpr", + "true", + "D16 instructions potentially have 32-bit data dependencies" +>; + def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts", "HasBF16TransInsts", "true", @@ -1934,7 +1940,9 @@ def FeatureISAVersion11_Common : FeatureSet< FeaturePackedTID, FeatureVcmpxPermlaneHazard, FeatureMemoryAtomicFAddF32DenormalSupport, - FeatureRealTrue16Insts]>; + FeatureRealTrue16Insts, + FeatureD16Writes32BitVgpr, +]>; // There are few workarounds that need to be // added to all targets. This pessimizes codegen @@ -2570,6 +2578,11 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() && // FIXME When we default to RealTrue16 instead of Fake, change the line as follows. // AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>; +def HasD16Writes32BitVgpr: Predicate<"Subtarget->hasD16Writes32BitVgpr()">, + AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, FeatureD16Writes32BitVgpr)>; +def NotHasD16Writes32BitVgpr: Predicate<"!Subtarget->hasD16Writes32BitVgpr()">, + AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, (not FeatureD16Writes32BitVgpr))>; + def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">, AssemblerPredicate<(all_of FeatureBF16TransInsts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 73acb1ddbd2a7..26e0b3dfc2e8a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -38,6 +38,10 @@ bool AMDGPUSubtarget::useRealTrue16Insts() const { return hasTrue16BitInsts() && EnableRealTrue16Insts; } +bool AMDGPUSubtarget::hasD16Writes32BitVgpr() const { + return EnableD16Writes32BitVgpr; +} + // Returns the maximum per-workgroup LDS allocation size (in bytes) that still // allows the given function to achieve an occupancy of NWaves waves per // SIMD / EU, taking into account only the function's *maximum* workgroup size. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 57b757c990e1a..ed03ef21b6dda 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -59,6 +59,7 @@ class AMDGPUSubtarget { bool HasCvtPkF16F32Inst = false; bool HasF32ToF16BF16ConversionSRInsts = false; bool EnableRealTrue16Insts = false; + bool EnableD16Writes32BitVgpr = false; bool HasBF16TransInsts = false; bool HasBF16ConversionInsts = false; bool HasBF16PackedInsts = false; @@ -224,6 +225,8 @@ class AMDGPUSubtarget { // supported and the support for fake True16 instructions is removed. bool useRealTrue16Insts() const; + bool hasD16Writes32BitVgpr() const; + bool hasBF16TransInsts() const { return HasBF16TransInsts; } bool hasBF16ConversionInsts() const { diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index b163a274396ff..82e0aaed59261 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -845,6 +845,15 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); assert(Size % 16 == 0); Result.second = Result.first + (Size / 16); + + if (Size == 16 && Context->ST->hasD16Writes32BitVgpr()) { + // Regardless of which lo16/hi16 is used, consider the full 32-bit + // register used. + if (AMDGPU::isHi16Reg(MCReg, *TRI)) + Result.first -= 1; + else + Result.second += 1; + } } else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) { // SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar // sources like SRC_PRIVATE_BASE. diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 46b82d3a3d651..1ce7179774349 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -15503,59 +15503,37 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l @@ -52226,59 +52204,37 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l @@ -87002,59 +86958,37 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l @@ -121707,59 +121641,37 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l @@ -147524,6 +147436,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l @@ -147555,7 +147468,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h @@ -147572,69 +147484,37 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v116.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v116.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v117.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v117.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v119.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v128.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v128.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v129.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v130.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v131.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v132.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l @@ -147648,7 +147528,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB88_4 ; GFX11-TRUE16-NEXT: .LBB88_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB88_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h @@ -147667,7 +147546,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v54.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v67.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.l @@ -147988,10 +147866,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v100.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v100.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v98.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -148008,10 +147884,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v97.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v85.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v96.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v86.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v2.l @@ -148019,7 +147893,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v87.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v131.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v129.h, v0.h @@ -148031,10 +147904,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v71.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v84.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v96.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v130.l, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -148051,10 +147922,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v85.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v82.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l @@ -148068,17 +147937,14 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v118.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v118.h, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v83.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v71.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v69.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v70.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -148096,10 +147962,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v66.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l @@ -173957,6 +173821,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l @@ -173988,7 +173853,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h @@ -174005,69 +173869,37 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v116.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v116.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v117.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v117.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v119.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v128.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v128.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v129.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v130.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v131.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v132.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l @@ -174081,7 +173913,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB92_4 ; GFX11-TRUE16-NEXT: .LBB92_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB92_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h @@ -174100,7 +173931,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v54.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v67.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.l @@ -174421,10 +174251,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v100.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v100.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v98.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -174441,10 +174269,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v97.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v85.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v96.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v86.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v2.l @@ -174452,7 +174278,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v87.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v131.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v129.h, v0.h @@ -174464,10 +174289,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v71.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v84.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v96.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v130.l, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -174484,10 +174307,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v85.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v82.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l @@ -174501,17 +174322,14 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v118.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v118.h, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v83.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v71.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v69.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v70.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -174529,10 +174347,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v66.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l @@ -196529,6 +196345,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l @@ -196560,7 +196377,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h @@ -196577,69 +196393,37 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v116.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v116.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v117.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v117.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v119.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v128.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v128.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v129.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v130.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v131.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v132.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l @@ -196653,7 +196437,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB96_4 ; GFX11-TRUE16-NEXT: .LBB96_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB96_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h @@ -196672,7 +196455,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v54.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v67.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.l @@ -196993,10 +196775,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v100.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v100.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v98.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -197013,10 +196793,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v97.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v85.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v96.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v86.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v2.l @@ -197024,7 +196802,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v87.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v131.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v129.h, v0.h @@ -197036,10 +196813,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v71.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v84.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v96.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v130.l, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -197056,10 +196831,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v85.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v82.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l @@ -197073,17 +196846,14 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v118.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v118.h, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v83.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v71.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v69.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v70.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -197101,10 +196871,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v66.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 436b1a038b274..2abb2f3b9de52 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -5033,6 +5033,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l @@ -5059,15 +5060,10 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -11993,6 +11989,7 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l @@ -12019,15 +12016,10 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -18559,6 +18551,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l @@ -18596,13 +18589,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l @@ -18701,10 +18690,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3 @@ -24640,6 +24628,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l @@ -24677,13 +24666,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l @@ -24782,10 +24767,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2 ; GFX11-TRUE16-NEXT: .LBB62_4: ; %cmp.true -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3 @@ -28760,6 +28744,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l @@ -28792,15 +28777,10 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -32871,6 +32851,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l @@ -32903,15 +32884,10 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index ede44e738fe00..352b2cb7123b1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -12492,6 +12492,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l @@ -12523,39 +12524,22 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -27377,6 +27361,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l @@ -27408,39 +27393,22 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -41534,6 +41502,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l @@ -41565,39 +41534,22 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -54837,6 +54789,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l @@ -54868,39 +54821,22 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -68501,6 +68437,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v19.l @@ -68533,37 +68470,24 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v48.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l @@ -68710,6 +68634,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 ; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 @@ -68717,7 +68642,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v38.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h @@ -68732,11 +68656,10 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.l, v0.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l @@ -68756,7 +68679,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h @@ -80726,6 +80648,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v19.l @@ -80758,37 +80681,24 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v48.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l @@ -80935,6 +80845,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2 ; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 @@ -80942,7 +80853,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v38.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h @@ -80957,11 +80867,10 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.l, v0.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l @@ -80981,7 +80890,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h @@ -91233,6 +91141,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v19.l @@ -91265,37 +91174,24 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v48.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l @@ -91442,6 +91338,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2 ; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 @@ -91449,7 +91346,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v38.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h @@ -91464,11 +91360,10 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.l, v0.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l @@ -91488,7 +91383,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll index dd389375b0d77..6bebc8f5d0d18 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll @@ -23,9 +23,9 @@ define amdgpu_kernel void @long_forward_branch_gfx11plus(ptr addrspace(1) %in, p ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_d16_b16 v0, v1, s[0:1] ; GFX11-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: global_store_b16 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b16 v1, v0, s[2:3] ; GFX11-NEXT: global_store_d16_hi_b16 v1, v0, s[2:3] offset:2 ; GFX11-NEXT: .LBB0_2: ; %bb3 ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index d374ed072cdc6..d9ac2d80af920 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -1109,6 +1109,7 @@ define <2 x i16> @chain_hi_to_lo_group_may_alias_store(ptr addrspace(3) %ptr, pt ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0x7b ; GFX11-TRUE16-NEXT: ds_load_u16_d16_hi v2, v0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_store_b16 v1, v2 ; GFX11-TRUE16-NEXT: ds_load_u16_d16 v2, v0 offset:2 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll index ccdc0b1bf43c4..a84872d8eac0f 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll @@ -1561,8 +1561,8 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0 clamp diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 305461ed6b208..049663a1e1bb4 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -1685,19 +1685,18 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v5 ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8 ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v7, v5, 0, 8 +; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l -; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-DL-TRUE16-NEXT: v_perm_b32 v1, v5, v5, 0xc0c0302 -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v3.l, v0.l ; GFX11-DL-TRUE16-NEXT: v_perm_b32 v2, v4, v4, 0xc0c0302 +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_dot4_u32_u8 v0, v2, v1, v0 ; GFX11-DL-TRUE16-NEXT: global_store_b16 v6, v0, s[4:5] ; GFX11-DL-TRUE16-NEXT: s_endpgm @@ -1977,13 +1976,12 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v6, v4, 0, 8 ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l ; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8 ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.l ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l ; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.h ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.l @@ -2726,10 +2724,10 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 24, v3 ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v4 +; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v0.h, 8, v3.l ; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v3.h, v4.h ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.h, 8, v4.l -; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l ; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v2.l, v2.l, v6.l ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index 25020673bce22..0a1d15bf945f9 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -376,9 +376,8 @@ define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v[2:3], off offset:48 ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off offset:32 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v2.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, v2.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, v0.h, v0.l diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index 74ac181c120b5..448585afd2405 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -333,9 +333,8 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v[2:3], off offset:48 ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off offset:32 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v2.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v2.l ; GFX11-TRUE16-NEXT: v_lshrrev_b16 v1.l, v2.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.h, v0.l diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index 91c88ec5e718c..b538d6066d551 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -1528,8 +1528,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-SDAG-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 64 ; GFX11-SDAG-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-SDAG-TRUE16-NEXT: s_endpgm @@ -1559,8 +1559,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] -; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, 0xffc0, v0.l ; GFX11-GISEL-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-TRUE16-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll index 3d21860e2af40..0e45df223465d 100644 --- a/llvm/test/CodeGen/AMDGPU/spillv16.ll +++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll @@ -61,8 +61,8 @@ define void @spill_i16_alu_two_vals() { ; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:4 glc dlc ; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GCN-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, s32 offset:6 ; 2-byte Folded Reload -; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l ; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l ; GCN-TRUE16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 dlc ; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 dlc