diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 25c4375a73ce0..e624088a0964e 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -4150,7 +4150,7 @@ bool RegisterCoalescer::applyTerminalRule(const MachineInstr &Copy) const { continue; Register OtherSrcReg, OtherReg; unsigned OtherSrcSubReg = 0, OtherSubReg = 0; - if (!isMoveInstr(*TRI, &Copy, OtherSrcReg, OtherReg, OtherSrcSubReg, + if (!isMoveInstr(*TRI, &MI, OtherSrcReg, OtherReg, OtherSrcSubReg, OtherSubReg)) return false; if (OtherReg == SrcReg) diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index 4894932d3c9b1..99c540366fb12 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -803,20 +803,20 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none ; CHECK-SD-NEXT: smlal2 v4.2d, v16.4s, v20.4s ; CHECK-SD-NEXT: smlal v6.2d, v16.2s, v20.2s ; CHECK-SD-NEXT: smlal v3.2d, v16.2s, v19.2s -; CHECK-SD-NEXT: smlal2 v1.2d, v16.4s, v18.4s +; CHECK-SD-NEXT: smlal2 v0.2d, v16.4s, v18.4s ; CHECK-SD-NEXT: smlal v7.2d, v16.2s, v17.2s -; CHECK-SD-NEXT: smlal v0.2d, v16.2s, v18.2s +; CHECK-SD-NEXT: smlal v1.2d, v16.2s, v18.2s ; CHECK-SD-NEXT: smlal2 v5.2d, v16.4s, v17.4s ; CHECK-SD-NEXT: b.ne .LBB6_7 ; CHECK-SD-NEXT: // %bb.8: // %middle.block -; CHECK-SD-NEXT: add v0.2d, v0.2d, v6.2d +; CHECK-SD-NEXT: add v1.2d, v1.2d, v6.2d ; CHECK-SD-NEXT: add v3.2d, v3.2d, v7.2d ; CHECK-SD-NEXT: cmp x10, x9 -; CHECK-SD-NEXT: add v1.2d, v1.2d, v4.2d +; CHECK-SD-NEXT: add v0.2d, v0.2d, v4.2d ; CHECK-SD-NEXT: add v2.2d, v2.2d, v5.2d -; CHECK-SD-NEXT: add v0.2d, v0.2d, v3.2d -; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d -; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: add v1.2d, v1.2d, v3.2d +; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-SD-NEXT: add v0.2d, v1.2d, v0.2d ; CHECK-SD-NEXT: addp d0, v0.2d ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: b.eq .LBB6_15 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll index 7542e9c4b8f5b..a4f20905a85c2 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll @@ -35,15 +35,15 @@ define i32 @check_deinterleaving_has_deinterleave(ptr %a) { ; CHECK-LABEL: check_deinterleaving_has_deinterleave: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: movi v2.4s, #1 ; CHECK-NEXT: add x8, x0, #16 -; CHECK-NEXT: movi v3.2d, #0000000000000000 -; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: mov w9, #32 // =0x20 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: mov w9, #32 // =0x20 +; CHECK-NEXT: movi v3.2d, #0000000000000000 ; CHECK-NEXT: movi v5.2d, #0000000000000000 -; CHECK-NEXT: movi v7.2d, #0000000000000000 ; CHECK-NEXT: movi v6.2d, #0000000000000000 +; CHECK-NEXT: movi v7.2d, #0000000000000000 ; CHECK-NEXT: movi v16.2d, #0000000000000000 ; CHECK-NEXT: .LBB1_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 @@ -64,31 +64,31 @@ define i32 @check_deinterleaving_has_deinterleave(ptr %a) { ; CHECK-NEXT: ushll v24.4s, v18.4h, #0 ; CHECK-NEXT: ushll2 v18.4s, v18.8h, #0 ; CHECK-NEXT: ushll v20.4s, v20.4h, #0 -; CHECK-NEXT: and v21.16b, v21.16b, v1.16b -; CHECK-NEXT: and v19.16b, v19.16b, v1.16b -; CHECK-NEXT: and v22.16b, v22.16b, v1.16b -; CHECK-NEXT: and v17.16b, v17.16b, v1.16b -; CHECK-NEXT: and v23.16b, v23.16b, v1.16b -; CHECK-NEXT: and v24.16b, v24.16b, v1.16b -; CHECK-NEXT: and v18.16b, v18.16b, v1.16b -; CHECK-NEXT: and v20.16b, v20.16b, v1.16b -; CHECK-NEXT: add v4.4s, v4.4s, v19.4s -; CHECK-NEXT: add v2.4s, v2.4s, v21.4s -; CHECK-NEXT: add v0.4s, v0.4s, v22.4s -; CHECK-NEXT: add v3.4s, v3.4s, v17.4s +; CHECK-NEXT: and v21.16b, v21.16b, v2.16b +; CHECK-NEXT: and v19.16b, v19.16b, v2.16b +; CHECK-NEXT: and v22.16b, v22.16b, v2.16b +; CHECK-NEXT: and v17.16b, v17.16b, v2.16b +; CHECK-NEXT: and v23.16b, v23.16b, v2.16b +; CHECK-NEXT: and v24.16b, v24.16b, v2.16b +; CHECK-NEXT: and v18.16b, v18.16b, v2.16b +; CHECK-NEXT: and v20.16b, v20.16b, v2.16b +; CHECK-NEXT: add v5.4s, v5.4s, v19.4s +; CHECK-NEXT: add v3.4s, v3.4s, v21.4s +; CHECK-NEXT: add v1.4s, v1.4s, v22.4s +; CHECK-NEXT: add v4.4s, v4.4s, v17.4s ; CHECK-NEXT: add v16.4s, v16.4s, v23.4s -; CHECK-NEXT: add v5.4s, v5.4s, v24.4s -; CHECK-NEXT: add v6.4s, v6.4s, v20.4s -; CHECK-NEXT: add v7.4s, v7.4s, v18.4s +; CHECK-NEXT: add v6.4s, v6.4s, v24.4s +; CHECK-NEXT: add v7.4s, v7.4s, v20.4s +; CHECK-NEXT: add v0.4s, v0.4s, v18.4s ; CHECK-NEXT: b.ne .LBB1_1 ; CHECK-NEXT: // %bb.2: // %middle.block -; CHECK-NEXT: add v1.4s, v7.4s, v3.4s -; CHECK-NEXT: add v3.4s, v16.4s, v4.4s -; CHECK-NEXT: add v0.4s, v5.4s, v0.4s -; CHECK-NEXT: add v2.4s, v6.4s, v2.4s -; CHECK-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-NEXT: add v0.4s, v0.4s, v4.4s +; CHECK-NEXT: add v2.4s, v16.4s, v5.4s +; CHECK-NEXT: add v1.4s, v6.4s, v1.4s +; CHECK-NEXT: add v3.4s, v7.4s, v3.4s ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll index 4f00aed3aa4bc..ddeeca7d5df50 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll @@ -31,14 +31,14 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) { ; CHECK-NEXT: ldr z5, [x1] ; CHECK-NEXT: add x1, x1, x10 ; CHECK-NEXT: add x0, x0, x10 -; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0 -; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0 -; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #90 -; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #90 +; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0 +; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #0 +; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #90 +; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: b.ne .LBB0_1 ; CHECK-NEXT: // %bb.2: // %exit.block -; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d -; CHECK-NEXT: uzp2 z1.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d +; CHECK-NEXT: uzp2 z1.d, z1.d, z0.d ; CHECK-NEXT: faddv d0, p0, z2.d ; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -205,20 +205,20 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) { ; CHECK-NEXT: ldr z18, [x1, #3, mul vl] ; CHECK-NEXT: ldr z19, [x1, #2, mul vl] ; CHECK-NEXT: add x1, x1, x10 -; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #0 -; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #0 +; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #0 +; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #0 ; CHECK-NEXT: fcmla z3.d, p0/m, z18.d, z6.d, #0 ; CHECK-NEXT: fcmla z2.d, p0/m, z19.d, z17.d, #0 -; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #90 -; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #90 +; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #90 +; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #90 ; CHECK-NEXT: fcmla z3.d, p0/m, z18.d, z6.d, #90 ; CHECK-NEXT: fcmla z2.d, p0/m, z19.d, z17.d, #90 ; CHECK-NEXT: b.ne .LBB2_1 ; CHECK-NEXT: // %bb.2: // %exit.block ; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d -; CHECK-NEXT: uzp1 z5.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z5.d, z1.d, z0.d ; CHECK-NEXT: uzp2 z2.d, z2.d, z3.d -; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z0.d, z1.d, z0.d ; CHECK-NEXT: fadd z1.d, z4.d, z5.d ; CHECK-NEXT: fadd z2.d, z2.d, z0.d ; CHECK-NEXT: faddv d0, p0, z1.d diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll index aed3072bb4af3..355adec955e4b 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll @@ -25,14 +25,14 @@ define dso_local %"struct.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q3, q2, [x9] ; CHECK-NEXT: cmp x8, #1600 ; CHECK-NEXT: ldp q5, q4, [x10] -; CHECK-NEXT: fcmla v0.2d, v5.2d, v3.2d, #0 -; CHECK-NEXT: fcmla v1.2d, v4.2d, v2.2d, #0 -; CHECK-NEXT: fcmla v0.2d, v5.2d, v3.2d, #90 -; CHECK-NEXT: fcmla v1.2d, v4.2d, v2.2d, #90 +; CHECK-NEXT: fcmla v1.2d, v5.2d, v3.2d, #0 +; CHECK-NEXT: fcmla v0.2d, v4.2d, v2.2d, #0 +; CHECK-NEXT: fcmla v1.2d, v5.2d, v3.2d, #90 +; CHECK-NEXT: fcmla v0.2d, v4.2d, v2.2d, #90 ; CHECK-NEXT: b.ne .LBB0_1 ; CHECK-NEXT: // %bb.2: // %middle.block -; CHECK-NEXT: zip2 v2.2d, v0.2d, v1.2d -; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d +; CHECK-NEXT: zip2 v2.2d, v1.2d, v0.2d +; CHECK-NEXT: zip1 v0.2d, v1.2d, v0.2d ; CHECK-NEXT: faddp d0, v0.2d ; CHECK-NEXT: faddp d1, v2.2d ; CHECK-NEXT: ret @@ -159,20 +159,20 @@ define %"struct.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q17, q16, [x8], #64 ; CHECK-NEXT: ldp q19, q18, [x9], #64 ; CHECK-NEXT: fcmla v2.2d, v7.2d, v5.2d, #0 -; CHECK-NEXT: fcmla v0.2d, v6.2d, v4.2d, #0 -; CHECK-NEXT: fcmla v1.2d, v19.2d, v17.2d, #0 +; CHECK-NEXT: fcmla v1.2d, v6.2d, v4.2d, #0 +; CHECK-NEXT: fcmla v0.2d, v19.2d, v17.2d, #0 ; CHECK-NEXT: fcmla v3.2d, v18.2d, v16.2d, #0 ; CHECK-NEXT: fcmla v2.2d, v7.2d, v5.2d, #90 -; CHECK-NEXT: fcmla v0.2d, v6.2d, v4.2d, #90 -; CHECK-NEXT: fcmla v1.2d, v19.2d, v17.2d, #90 +; CHECK-NEXT: fcmla v1.2d, v6.2d, v4.2d, #90 +; CHECK-NEXT: fcmla v0.2d, v19.2d, v17.2d, #90 ; CHECK-NEXT: fcmla v3.2d, v18.2d, v16.2d, #90 ; CHECK-NEXT: b.ne .LBB2_1 ; CHECK-NEXT: // %bb.2: // %middle.block -; CHECK-NEXT: zip2 v4.2d, v1.2d, v3.2d -; CHECK-NEXT: zip1 v1.2d, v1.2d, v3.2d -; CHECK-NEXT: zip2 v3.2d, v2.2d, v0.2d -; CHECK-NEXT: zip1 v0.2d, v2.2d, v0.2d -; CHECK-NEXT: fadd v0.2d, v1.2d, v0.2d +; CHECK-NEXT: zip2 v4.2d, v0.2d, v3.2d +; CHECK-NEXT: zip1 v0.2d, v0.2d, v3.2d +; CHECK-NEXT: zip2 v3.2d, v2.2d, v1.2d +; CHECK-NEXT: zip1 v1.2d, v2.2d, v1.2d +; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d ; CHECK-NEXT: fadd v1.2d, v4.2d, v3.2d ; CHECK-NEXT: faddp d0, v0.2d ; CHECK-NEXT: faddp d1, v1.2d diff --git a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll index 338084295fc7f..0fe4683d97a23 100644 --- a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll +++ b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll @@ -16,8 +16,9 @@ define i32 @test(ptr %ptr) { ; CHECK-NEXT: mov w9, wzr ; CHECK-NEXT: LBB0_1: ; %.thread ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: lsr w11, w9, #1 ; CHECK-NEXT: sub w10, w9, #1 -; CHECK-NEXT: lsr w9, w9, #1 +; CHECK-NEXT: mov w9, w11 ; CHECK-NEXT: tbnz w10, #0, LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %bb343 ; CHECK-NEXT: and w9, w10, #0x1 diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll index 52a77cb396909..6c6a691760af3 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll @@ -147,15 +147,15 @@ define <2 x float> @extract_v2f32_nxv16f32_2( %arg) { define <4 x i1> @extract_v4i1_nxv32i1_0( %arg) { ; CHECK-LABEL: extract_v4i1_nxv32i1_0: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 -; CHECK-NEXT: umov w8, v0.b[1] -; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 +; CHECK-NEXT: umov w8, v1.b[1] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: umov w9, v1.b[2] ; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: umov w8, v1.b[2] -; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: umov w8, v1.b[3] +; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %ext = call <4 x i1> @llvm.vector.extract.v4i1.nxv32i1( %arg, i64 0) ret <4 x i1> %ext diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll index 72994100b2970..1cefe96962e29 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll @@ -248,15 +248,15 @@ define <2 x i1> @extract_v2i1_nxv2i1( %inmask) { define <4 x i1> @extract_v4i1_nxv4i1( %inmask) { ; CHECK-LABEL: extract_v4i1_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1 +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: mov w9, v1.s[2] ; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, v1.s[2] -; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: mov w8, v1.s[3] +; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %mask = call <4 x i1> @llvm.vector.extract.v4i1.nxv4i1( %inmask, i64 0) ret <4 x i1> %mask @@ -265,23 +265,23 @@ define <4 x i1> @extract_v4i1_nxv4i1( %inmask) { define <8 x i1> @extract_v8i1_nxv8i1( %inmask) { ; CHECK-LABEL: extract_v8i1_nxv8i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1 -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov z1.h, p0/z, #1 // =0x1 +; CHECK-NEXT: umov w8, v1.h[1] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: umov w9, v1.h[2] ; CHECK-NEXT: mov v0.b[1], w8 -; CHECK-NEXT: umov w8, v1.h[2] -; CHECK-NEXT: mov v0.b[2], w8 ; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: mov v0.b[2], w9 +; CHECK-NEXT: umov w9, v1.h[4] ; CHECK-NEXT: mov v0.b[3], w8 -; CHECK-NEXT: umov w8, v1.h[4] -; CHECK-NEXT: mov v0.b[4], w8 ; CHECK-NEXT: umov w8, v1.h[5] +; CHECK-NEXT: mov v0.b[4], w9 +; CHECK-NEXT: umov w9, v1.h[6] ; CHECK-NEXT: mov v0.b[5], w8 -; CHECK-NEXT: umov w8, v1.h[6] -; CHECK-NEXT: mov v0.b[6], w8 ; CHECK-NEXT: umov w8, v1.h[7] +; CHECK-NEXT: mov v0.b[6], w9 ; CHECK-NEXT: mov v0.b[7], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %mask = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1( %inmask, i64 0) ret <8 x i1> %mask diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll index 8e807cda7166d..41e4a38fad90b 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll @@ -8,15 +8,15 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i1> @reshuffle_v4i1_nxv4i1( %a) #0 { ; CHECK-LABEL: reshuffle_v4i1_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1 +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: mov w9, v1.s[2] ; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, v1.s[2] -; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: mov w8, v1.s[3] +; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %el0 = extractelement %a, i32 0 %el1 = extractelement %a, i32 1 diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index 935189dec48ac..74a717f1635a3 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -2835,11 +2835,11 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB24_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v0.16b }, [x1], #16 -; CHECK-BE-NEXT: mov x8, x0 +; CHECK-BE-NEXT: add x8, x0, #16 ; CHECK-BE-NEXT: ld1 { v1.8h }, [x0] -; CHECK-BE-NEXT: add x0, x0, #16 -; CHECK-BE-NEXT: add x9, x8, #48 -; CHECK-BE-NEXT: ld1 { v3.8h }, [x0] +; CHECK-BE-NEXT: ld1 { v3.8h }, [x8] +; CHECK-BE-NEXT: add x9, x0, #48 +; CHECK-BE-NEXT: add x10, x0, #32 ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0 @@ -2847,11 +2847,11 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: umull2 v5.4s, v3.8h, v0.8h ; CHECK-BE-NEXT: umull v0.4s, v3.4h, v0.4h ; CHECK-BE-NEXT: umull2 v1.4s, v1.8h, v2.8h -; CHECK-BE-NEXT: st1 { v4.4s }, [x8] -; CHECK-BE-NEXT: add x8, x8, #32 +; CHECK-BE-NEXT: st1 { v4.4s }, [x0] +; CHECK-BE-NEXT: mov x0, x8 ; CHECK-BE-NEXT: st1 { v5.4s }, [x9] -; CHECK-BE-NEXT: st1 { v0.4s }, [x8] -; CHECK-BE-NEXT: st1 { v1.4s }, [x0] +; CHECK-BE-NEXT: st1 { v0.4s }, [x10] +; CHECK-BE-NEXT: st1 { v1.4s }, [x8] ; CHECK-BE-NEXT: b.ne .LBB24_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr @@ -2950,26 +2950,26 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB25_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v4.16b }, [x0] -; CHECK-BE-NEXT: add x10, x1, #48 +; CHECK-BE-NEXT: add x9, x1, #48 +; CHECK-BE-NEXT: add x8, x1, #32 +; CHECK-BE-NEXT: ld1 { v18.4s }, [x9] ; CHECK-BE-NEXT: ld1 { v16.4s }, [x1] -; CHECK-BE-NEXT: add x9, x1, #32 -; CHECK-BE-NEXT: ld1 { v18.4s }, [x10] ; CHECK-BE-NEXT: add x1, x1, #16 -; CHECK-BE-NEXT: ld1 { v20.4s }, [x9] +; CHECK-BE-NEXT: ld1 { v20.4s }, [x8] ; CHECK-BE-NEXT: ld1 { v22.4s }, [x1] -; CHECK-BE-NEXT: add x9, x0, #96 +; CHECK-BE-NEXT: add x8, x0, #96 ; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v3.16b ; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v2.16b ; CHECK-BE-NEXT: tbl v7.16b, { v4.16b }, v1.16b ; CHECK-BE-NEXT: tbl v4.16b, { v4.16b }, v0.16b ; CHECK-BE-NEXT: ext v24.16b, v18.16b, v18.16b, #8 -; CHECK-BE-NEXT: mov x8, x0 +; CHECK-BE-NEXT: add x9, x0, #32 ; CHECK-BE-NEXT: ext v25.16b, v20.16b, v20.16b, #8 -; CHECK-BE-NEXT: add x10, x0, #32 +; CHECK-BE-NEXT: add x10, x0, #16 ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ext v17.16b, v5.16b, v5.16b, #8 -; CHECK-BE-NEXT: rev32 v5.8b, v5.8b ; CHECK-BE-NEXT: ext v19.16b, v6.16b, v6.16b, #8 +; CHECK-BE-NEXT: rev32 v5.8b, v5.8b ; CHECK-BE-NEXT: rev32 v21.8b, v7.8b ; CHECK-BE-NEXT: rev32 v23.8b, v4.8b ; CHECK-BE-NEXT: ext v7.16b, v7.16b, v7.16b, #8 @@ -2986,22 +2986,22 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: rev32 v4.8b, v4.8b ; CHECK-BE-NEXT: umull v17.2d, v17.2s, v24.2s ; CHECK-BE-NEXT: umull v19.2d, v19.2s, v25.2s -; CHECK-BE-NEXT: st1 { v5.2d }, [x9] +; CHECK-BE-NEXT: st1 { v5.2d }, [x8] ; CHECK-BE-NEXT: umull v5.2d, v6.2s, v20.2s ; CHECK-BE-NEXT: umull v6.2d, v7.2s, v21.2s -; CHECK-BE-NEXT: add x9, x0, #112 +; CHECK-BE-NEXT: add x8, x0, #112 ; CHECK-BE-NEXT: umull v4.2d, v4.2s, v16.2s -; CHECK-BE-NEXT: st1 { v18.2d }, [x10] -; CHECK-BE-NEXT: add x10, x0, #80 +; CHECK-BE-NEXT: st1 { v18.2d }, [x9] +; CHECK-BE-NEXT: add x9, x0, #80 ; CHECK-BE-NEXT: st1 { v22.2d }, [x0] -; CHECK-BE-NEXT: add x0, x0, #64 -; CHECK-BE-NEXT: st1 { v17.2d }, [x9] -; CHECK-BE-NEXT: add x9, x8, #48 -; CHECK-BE-NEXT: add x8, x8, #16 -; CHECK-BE-NEXT: st1 { v19.2d }, [x10] -; CHECK-BE-NEXT: st1 { v5.2d }, [x0] +; CHECK-BE-NEXT: st1 { v17.2d }, [x8] +; CHECK-BE-NEXT: add x8, x0, #64 +; CHECK-BE-NEXT: st1 { v19.2d }, [x9] +; CHECK-BE-NEXT: add x9, x0, #48 +; CHECK-BE-NEXT: mov x0, x8 +; CHECK-BE-NEXT: st1 { v5.2d }, [x8] ; CHECK-BE-NEXT: st1 { v6.2d }, [x9] -; CHECK-BE-NEXT: st1 { v4.2d }, [x8] +; CHECK-BE-NEXT: st1 { v4.2d }, [x10] ; CHECK-BE-NEXT: b.ne .LBB25_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr @@ -3093,14 +3093,13 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB26_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v4.16b }, [x1], #16 -; CHECK-BE-NEXT: mov x8, x0 -; CHECK-BE-NEXT: add x9, x0, #32 +; CHECK-BE-NEXT: add x8, x0, #32 ; CHECK-BE-NEXT: ld1 { v16.4s }, [x0] -; CHECK-BE-NEXT: add x10, x0, #48 -; CHECK-BE-NEXT: add x0, x0, #16 -; CHECK-BE-NEXT: ld1 { v17.4s }, [x9] -; CHECK-BE-NEXT: ld1 { v18.4s }, [x10] -; CHECK-BE-NEXT: ld1 { v19.4s }, [x0] +; CHECK-BE-NEXT: add x9, x0, #48 +; CHECK-BE-NEXT: add x10, x0, #16 +; CHECK-BE-NEXT: ld1 { v17.4s }, [x8] +; CHECK-BE-NEXT: ld1 { v18.4s }, [x9] +; CHECK-BE-NEXT: ld1 { v19.4s }, [x10] ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v1.16b ; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v3.16b @@ -3114,10 +3113,11 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: mul v6.4s, v17.4s, v6.4s ; CHECK-BE-NEXT: mul v7.4s, v18.4s, v7.4s ; CHECK-BE-NEXT: mul v4.4s, v19.4s, v4.4s -; CHECK-BE-NEXT: st1 { v5.4s }, [x8] -; CHECK-BE-NEXT: st1 { v6.4s }, [x9] -; CHECK-BE-NEXT: st1 { v7.4s }, [x10] -; CHECK-BE-NEXT: st1 { v4.4s }, [x0] +; CHECK-BE-NEXT: st1 { v5.4s }, [x0] +; CHECK-BE-NEXT: mov x0, x10 +; CHECK-BE-NEXT: st1 { v6.4s }, [x8] +; CHECK-BE-NEXT: st1 { v7.4s }, [x9] +; CHECK-BE-NEXT: st1 { v4.4s }, [x10] ; CHECK-BE-NEXT: b.ne .LBB26_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr @@ -3246,11 +3246,11 @@ define i32 @mul_zext_16i8_sext_16i16(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB28_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v0.16b }, [x1], #16 -; CHECK-BE-NEXT: mov x8, x0 +; CHECK-BE-NEXT: add x8, x0, #16 ; CHECK-BE-NEXT: ld1 { v1.8h }, [x0] -; CHECK-BE-NEXT: add x0, x0, #16 -; CHECK-BE-NEXT: add x9, x8, #48 -; CHECK-BE-NEXT: ld1 { v3.8h }, [x0] +; CHECK-BE-NEXT: ld1 { v3.8h }, [x8] +; CHECK-BE-NEXT: add x9, x0, #48 +; CHECK-BE-NEXT: add x10, x0, #32 ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0 @@ -3258,11 +3258,11 @@ define i32 @mul_zext_16i8_sext_16i16(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: smull2 v5.4s, v3.8h, v0.8h ; CHECK-BE-NEXT: smull v0.4s, v3.4h, v0.4h ; CHECK-BE-NEXT: smull2 v1.4s, v1.8h, v2.8h -; CHECK-BE-NEXT: st1 { v4.4s }, [x8] -; CHECK-BE-NEXT: add x8, x8, #32 +; CHECK-BE-NEXT: st1 { v4.4s }, [x0] +; CHECK-BE-NEXT: mov x0, x8 ; CHECK-BE-NEXT: st1 { v5.4s }, [x9] -; CHECK-BE-NEXT: st1 { v0.4s }, [x8] -; CHECK-BE-NEXT: st1 { v1.4s }, [x0] +; CHECK-BE-NEXT: st1 { v0.4s }, [x10] +; CHECK-BE-NEXT: st1 { v1.4s }, [x8] ; CHECK-BE-NEXT: b.ne .LBB28_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll index c1e6b4fffa82d..8372d22b72afc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll @@ -21,14 +21,14 @@ define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val, ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX10-NEXT: s_mov_b32 s8, exec_lo -; GFX10-NEXT: s_mov_b32 s9, s5 ; GFX10-NEXT: s_add_i32 s6, s6, 1 -; GFX10-NEXT: s_xor_b32 s5, s5, s8 +; GFX10-NEXT: s_xor_b32 s8, s5, s8 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo -; GFX10-NEXT: s_and_b32 s8, exec_lo, s9 -; GFX10-NEXT: s_or_b32 s7, s7, s8 +; GFX10-NEXT: s_and_b32 s9, exec_lo, s5 +; GFX10-NEXT: s_mov_b32 s5, s8 +; GFX10-NEXT: s_or_b32 s7, s7, s9 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %exit diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index 9a90faf723461..7bd1ff2201977 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -78,13 +78,12 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[2:3], v1, s[2:3] -; GFX11-NEXT: global_load_b32 v5, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v6, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v5, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v6, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v5, v[4:5] -; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v3, v6, v[1:2] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, v4 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -127,14 +126,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] +; GFX11-NEXT: global_load_b32 v6, v1, s[2:3] ; GFX11-NEXT: global_load_b64 v[2:3], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v2, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] -; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v6, v3, v[1:2] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, v4 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -224,14 +222,13 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v6, v0, s[2:3] ; GFX11-NEXT: global_load_b64 v[2:3], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v2, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] -; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v6, v3, v[1:2] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, v4 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -523,28 +520,28 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] -; GFX11-NEXT: global_load_b64 v[5:6], v0, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v0, s[4:5] ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[3:4] +; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[2:3] ; GFX11-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %else ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v5, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v6, v[4:5] -; GFX11-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v2, v5, v[1:2] +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: v_mov_b32_e32 v1, v3 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: .LBB10_2: ; %Flow ; GFX11-NEXT: s_and_not1_saveexec_b32 s2, s2 ; GFX11-NEXT: s_cbranch_execz .LBB10_4 ; GFX11-NEXT: ; %bb.3: ; %if ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mul_lo_u32 v1, v3, v6 +; GFX11-NEXT: v_mul_lo_u32 v1, v2, v5 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: .LBB10_4: ; %endif ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 3eecaccf0308f..43ff9ee1ad27d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -3131,8 +3131,8 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0 ; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v6, v[5:6] +; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v6, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; @@ -3143,8 +3143,8 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v4 -; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v7, v6, v[5:6] +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; @@ -3155,8 +3155,8 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 ; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v4 -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v7, v6, v[5:6] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4] +; GFX9-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; @@ -3176,8 +3176,8 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v4 -; GFX11-NEXT: v_mov_b32_e32 v5, v3 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, 0x50, v6, v[5:6] +; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, 0x50, v6, v[3:4] +; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 88e3c86c791de..50e28a7245db8 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -2147,12 +2147,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s4, v2, s[2:3] +; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: v_mov_b32_e32 v0, v4 -; GFX1164-NEXT: v_mad_u64_u32 v[4:5], null, s5, v2, v[0:1] -; GFX1164-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 +; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] +; GFX1164-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i64_uniform: @@ -2190,12 +2190,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s4, v2, s[2:3] +; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: v_mov_b32_e32 v0, v4 -; GFX1132-NEXT: v_mad_u64_u32 v[4:5], null, s5, v2, v[0:1] -; GFX1132-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 +; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] +; GFX1132-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm ; ; GFX1264-LABEL: add_i64_uniform: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 0a098eb6582c7..a9938f17dacb7 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1889,13 +1889,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1164-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: v_mov_b32_e32 v0, v4 -; GFX1164-NEXT: v_mad_u64_u32 v[4:5], null, s3, v2, v[0:1] +; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i64_uniform: @@ -1926,13 +1926,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1132-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, s[4:5] +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: v_mov_b32_e32 v0, v4 -; GFX1132-NEXT: v_mad_u64_u32 v[4:5], null, s3, v2, v[0:1] +; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 +; GFX1132-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %additive acq_rel diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index b6eaaf1369ab4..d499b3d5576d7 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -326,12 +326,12 @@ define void @local_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB2_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -347,12 +347,12 @@ define void @local_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB2_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -440,12 +440,12 @@ define void @local_atomic_fadd_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 +; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -462,12 +462,12 @@ define void @local_atomic_fadd_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -880,14 +880,13 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[1:2], 4.0, v[3:4] +; GFX12-NEXT: v_add_f64_e32 v[3:4], 4.0, v[1:2] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] +; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] +; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -914,14 +913,13 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] +; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] +; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -938,14 +936,14 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] +; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB6_1 @@ -970,13 +968,13 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 -; GFX908-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] +; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -992,13 +990,13 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 -; GFX8-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] +; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1014,13 +1012,13 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 -; GFX7-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] +; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB6_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1036,13 +1034,13 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] +; GFX6-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v2, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB6_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1065,14 +1063,13 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[1:2], 4.0, v[3:4] +; GFX12-NEXT: v_add_f64_e32 v[3:4], 4.0, v[1:2] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] offset:65528 +; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] +; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1099,14 +1096,13 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] offset:65528 +; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] +; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1123,14 +1119,14 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 +; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB7_1 @@ -1155,13 +1151,13 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 -; GFX908-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 +; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1177,13 +1173,13 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 -; GFX8-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 +; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1199,13 +1195,13 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 -; GFX7-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 +; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1222,13 +1218,13 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: v_add_f64 v[0:1], v[3:4], 4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX6-NEXT: v_add_f64 v[3:4], v[0:1], 4.0 +; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v2, v[0:1], v[3:4] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1333,30 +1329,30 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX942-NEXT: ds_read_b32 v2, v1 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX942-NEXT: ds_read_b32 v3, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_and_b32_e32 v0, 24, v3 -; GFX942-NEXT: v_lshlrev_b32_e64 v3, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v3, v3 +; GFX942-NEXT: v_and_b32_e32 v0, 24, v2 +; GFX942-NEXT: v_lshlrev_b32_e64 v2, v2, s0 +; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX942-NEXT: v_add_f16_e32 v2, 4.0, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX942-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX942-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_f16: @@ -1467,30 +1463,30 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX90A-NEXT: ds_read_b32 v2, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX90A-NEXT: ds_read_b32 v3, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_and_b32_e32 v0, 24, v3 -; GFX90A-NEXT: v_lshlrev_b32_e64 v3, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, 24, v2 +; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v2, s4 +; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX90A-NEXT: v_add_f16_e32 v2, 4.0, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX90A-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_ret_f16: @@ -1721,30 +1717,30 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_add_u32_e32 v0, 0xfffe, v0 ; GFX942-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX942-NEXT: ds_read_b32 v2, v1 +; GFX942-NEXT: ds_read_b32 v3, v1 ; GFX942-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v3, v0, s0 -; GFX942-NEXT: v_not_b32_e32 v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e64 v2, v0, s0 +; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX942-NEXT: v_add_f16_e32 v2, 4.0, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX942-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX942-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_f16__offset: @@ -1861,30 +1857,30 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v0, 0xfffe, v0 ; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX90A-NEXT: ds_read_b32 v2, v1 +; GFX90A-NEXT: ds_read_b32 v3, v1 ; GFX90A-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v3, v0, s4 -; GFX90A-NEXT: v_not_b32_e32 v3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX90A-NEXT: v_add_f16_e32 v2, 4.0, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX90A-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_ret_f16__offset: @@ -2036,27 +2032,27 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -2077,28 +2073,28 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2123,15 +2119,15 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX942-NEXT: v_add_f16_e32 v3, 4.0, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX942-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2144,27 +2140,27 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2179,28 +2175,28 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2215,23 +2211,23 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v2, v2 +; GFX10-NEXT: v_not_b32_e32 v3, v3 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX10-NEXT: v_add_f16_e32 v3, 4.0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX10-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 @@ -2253,15 +2249,15 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX90A-NEXT: v_add_f16_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX90A-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2282,15 +2278,15 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX908-NEXT: v_add_f16_e32 v3, 4.0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX908-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2312,16 +2308,16 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX8-NEXT: v_add_f16_e32 v3, 4.0, v3 -; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX8-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2342,18 +2338,18 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX7-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX7-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2374,18 +2370,18 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX6-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2416,19 +2412,19 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -2459,20 +2455,19 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2498,15 +2493,15 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX942-NEXT: v_add_f16_e32 v3, 4.0, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX942-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2529,19 +2524,19 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2566,20 +2561,19 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2602,16 +2596,16 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX10-NEXT: v_add_f16_e32 v3, 4.0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX10-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 @@ -2634,15 +2628,15 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX90A-NEXT: v_add_f16_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX90A-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2664,15 +2658,15 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX908-NEXT: v_add_f16_e32 v3, 4.0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX908-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2695,16 +2689,16 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX8-NEXT: v_add_f16_e32 v3, 4.0, v3 -; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX8-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2726,18 +2720,18 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX7-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX7-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2759,18 +2753,18 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX6-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2852,19 +2846,19 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX942-LABEL: local_atomic_fadd_ret_f16__offset__align4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 +; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_add_f16_e32 v1, 4.0, v2 ; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1 ; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2955,19 +2949,19 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX90A-LABEL: local_atomic_fadd_ret_f16__offset__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_add_f16_e32 v1, 4.0, v2 ; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3092,16 +3086,16 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -3124,17 +3118,16 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v2, 4.0, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, 4.0, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -3154,13 +3147,13 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_add_f16_e32 v1, 4.0, v2 -; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX942-NEXT: v_add_f16_e32 v2, 4.0, v1 +; GFX942-NEXT: v_and_or_b32 v2, v1, s2, v2 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3175,16 +3168,16 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3201,17 +3194,16 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v2, 4.0, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 4.0, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3228,15 +3220,15 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_add_f16_e32 v1, 4.0, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX10-NEXT: v_add_f16_e32 v2, 4.0, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 @@ -3253,13 +3245,13 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_add_f16_e32 v1, 4.0, v2 -; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX90A-NEXT: v_add_f16_e32 v2, 4.0, v1 +; GFX90A-NEXT: v_and_or_b32 v2, v1, s6, v2 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3275,13 +3267,13 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_add_f16_e32 v1, 4.0, v2 -; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX908-NEXT: v_add_f16_e32 v2, 4.0, v1 +; GFX908-NEXT: v_and_or_b32 v2, v1, s6, v2 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3297,14 +3289,14 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_add_f16_e32 v1, 4.0, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX8-NEXT: v_add_f16_e32 v2, 4.0, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3320,16 +3312,16 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3346,16 +3338,16 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3491,27 +3483,27 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_bf16: @@ -3658,25 +3650,25 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_ret_bf16: @@ -3950,27 +3942,27 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_bf16__offset: @@ -4123,25 +4115,25 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_ret_bf16__offset: @@ -4305,38 +4297,38 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 ; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -4357,37 +4349,37 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 ; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4413,22 +4405,22 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4441,38 +4433,38 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4487,37 +4479,37 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4532,28 +4524,28 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v2, v2 +; GFX10-NEXT: v_not_b32_e32 v3, v3 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -4576,20 +4568,20 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4611,20 +4603,20 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4646,22 +4638,22 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4682,18 +4674,18 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4714,18 +4706,18 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4756,30 +4748,29 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -4810,29 +4801,28 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4859,22 +4849,22 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4898,29 +4888,28 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4946,28 +4935,27 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4990,21 +4978,21 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -5028,20 +5016,20 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5064,20 +5052,20 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5100,22 +5088,22 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5137,18 +5125,18 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5170,18 +5158,18 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5283,14 +5271,13 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-LABEL: local_atomic_fadd_ret_bf16__offset__align4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 +; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 ; GFX942-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX942-NEXT: v_add_f32_e32 v1, 4.0, v1 ; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 @@ -5305,6 +5292,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB18_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5421,14 +5409,13 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-LABEL: local_atomic_fadd_ret_bf16__offset__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1 ; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1 @@ -5442,6 +5429,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5581,27 +5569,26 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, 4.0, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -5624,26 +5611,25 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX12-FAKE16-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, 4.0, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -5661,24 +5647,24 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX942-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX942-NEXT: v_add3_u32 v3, v3, v1, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX942-NEXT: v_add_f32_e32 v2, 4.0, v2 +; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v3, v3, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX942-NEXT: v_and_or_b32 v1, v2, s3, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_and_or_b32 v2, v1, s3, v2 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB19_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5694,26 +5680,25 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 4.0, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5731,25 +5716,24 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 4.0, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5766,21 +5750,21 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_add_f32_e32 v2, 4.0, v2 +; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -5798,20 +5782,20 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v3, v3, v1, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX90A-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: v_add_f32_e32 v2, 4.0, v2 +; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v1, s7, v2 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5828,20 +5812,20 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v3, v3, v1, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX908-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX908-NEXT: v_add_f32_e32 v2, 4.0, v2 +; GFX908-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v3, v3, v2, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: v_and_or_b32 v2, v1, s7, v2 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5857,21 +5841,21 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_add_f32_e32 v2, 4.0, v2 +; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5887,16 +5871,16 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5913,16 +5897,16 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6010,17 +5994,17 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-LABEL: local_atomic_fadd_ret_v2f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6234,17 +6218,17 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-LABEL: local_atomic_fadd_ret_v2f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6415,14 +6399,13 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6439,13 +6422,13 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB22_1 @@ -6461,12 +6444,12 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6481,12 +6464,12 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: v_pk_add_f16 v2, v3, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6502,14 +6485,14 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v4, v3, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v4, v2, v1 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6631,14 +6614,13 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6655,13 +6637,13 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 @@ -6677,12 +6659,12 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6697,12 +6679,12 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: v_pk_add_f16 v2, v3, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6718,14 +6700,14 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v4, v3, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v4, v2, v1 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6980,40 +6962,40 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-LABEL: local_atomic_fadd_ret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_ret_v2bf16: @@ -7334,40 +7316,40 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-LABEL: local_atomic_fadd_ret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fadd_ret_v2bf16__offset: @@ -7565,32 +7547,30 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v4, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7611,32 +7591,30 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7656,27 +7634,27 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -7696,26 +7674,26 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7734,26 +7712,26 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7771,29 +7749,29 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7910,32 +7888,30 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v4, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7956,32 +7932,30 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -8001,27 +7975,27 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -8041,26 +8015,26 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8079,26 +8053,26 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8116,29 +8090,29 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8875,20 +8849,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: ; %bb.5: ; GFX7-NEXT: s_lshl_b32 s0, s3, 4 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_read_b32 v2, v1 +; GFX7-NEXT: ds_read_b32 v3, v1 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 -; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_6: ; %atomicrmw.start2 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v4, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_6 ; GFX7-NEXT: .LBB28_7: ; %Flow21 @@ -8999,20 +8973,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: ; %bb.5: ; GFX6-NEXT: s_lshl_b32 s0, s3, 4 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_read_b32 v2, v1 +; GFX6-NEXT: ds_read_b32 v3, v1 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 -; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_6: ; %atomicrmw.start2 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v4, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_6 ; GFX6-NEXT: .LBB28_7: ; %Flow19 @@ -9703,20 +9677,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: ; %bb.5: ; GFX7-NEXT: s_lshl_b32 s0, s3, 4 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_read_b32 v2, v1 +; GFX7-NEXT: ds_read_b32 v3, v1 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 -; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_6: ; %atomicrmw.start2 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v4, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_6 ; GFX7-NEXT: .LBB29_7: ; %Flow21 @@ -9827,20 +9801,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: ; %bb.5: ; GFX6-NEXT: s_lshl_b32 s0, s3, 4 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_read_b32 v2, v1 +; GFX6-NEXT: ds_read_b32 v3, v1 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 -; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_6: ; %atomicrmw.start2 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v4, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_6 ; GFX6-NEXT: .LBB29_7: ; %Flow19 @@ -10110,12 +10084,12 @@ define void @local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10131,12 +10105,12 @@ define void @local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 8e094a7269a49..282c754473047 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -886,21 +886,21 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX942-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX942-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_max_f16_e32 v4, 4.0, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_f16: @@ -1025,21 +1025,21 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX90A-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_max_f16_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_f16: @@ -1285,21 +1285,21 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX942-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX942-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_max_f16_e32 v4, 4.0, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_f16__offset: @@ -1430,21 +1430,21 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX90A-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_max_f16_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_f16__offset: @@ -1598,29 +1598,29 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, 4.0, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, 4.0, v4.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -1641,29 +1641,29 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, 4.0, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -1688,16 +1688,16 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX942-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX942-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_max_f16_e32 v4, 4.0, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1710,29 +1710,29 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, 4.0, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, 4.0, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1747,29 +1747,29 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1784,24 +1784,24 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v2, v2 +; GFX10-NEXT: v_not_b32_e32 v3, v3 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX10-NEXT: v_max_f16_e32 v4, 4.0, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 @@ -1823,16 +1823,16 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX90A-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_max_f16_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1853,16 +1853,16 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX908-NEXT: v_max_f16_e32 v4, 4.0, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1884,17 +1884,17 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX8-NEXT: v_max_f16_e32 v4, 4.0, v4 +; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1915,18 +1915,18 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX7-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX7-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1947,18 +1947,18 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX6-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX6-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1989,21 +1989,20 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, 4.0, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, 4.0, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -2034,21 +2033,21 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, 4.0, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, 4.0, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2074,16 +2073,16 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX942-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX942-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_max_f16_e32 v4, 4.0, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2106,21 +2105,20 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, 4.0, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, 4.0, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2145,21 +2143,21 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, 4.0, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2182,17 +2180,17 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX10-NEXT: v_max_f16_e32 v4, 4.0, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 @@ -2215,16 +2213,16 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX90A-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_max_f16_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2246,16 +2244,16 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX908-NEXT: v_max_f16_e32 v4, 4.0, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2278,17 +2276,17 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX8-NEXT: v_max_f16_e32 v4, 4.0, v4 +; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2310,18 +2308,18 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX7-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX7-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2343,18 +2341,18 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX6-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX6-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2439,13 +2437,12 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX942-LABEL: local_atomic_fmax_ret_f16__offset__align4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 +; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX942-NEXT: v_max_f16_e32 v1, 4.0, v1 ; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1 @@ -2453,6 +2450,7 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2547,13 +2545,12 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX90A-LABEL: local_atomic_fmax_ret_f16__offset__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v1, 4.0, v1 ; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1 @@ -2561,6 +2558,7 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2687,18 +2685,17 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, 4.0, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, 4.0, v2.l +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -2721,18 +2718,18 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v2, v2 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, 4.0, v1 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v1, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, 4.0, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2752,14 +2749,14 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX942-NEXT: v_max_f16_e32 v1, 4.0, v1 -; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX942-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX942-NEXT: v_max_f16_e32 v2, 4.0, v2 +; GFX942-NEXT: v_and_or_b32 v2, v1, s2, v2 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2774,18 +2771,17 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, 4.0, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, 4.0, v2.l +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2802,18 +2798,18 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v1, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, 4.0, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, 4.0, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2830,16 +2826,16 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v1, 4.0, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX10-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX10-NEXT: v_max_f16_e32 v2, 4.0, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 @@ -2856,14 +2852,14 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v1, 4.0, v1 -; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX90A-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX90A-NEXT: v_max_f16_e32 v2, 4.0, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v1, s6, v2 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2879,14 +2875,14 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v1, 4.0, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX908-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX908-NEXT: v_max_f16_e32 v2, 4.0, v2 +; GFX908-NEXT: v_and_or_b32 v2, v1, s6, v2 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2902,15 +2898,15 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_max_f16_e32 v1, 4.0, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_max_f16_e32 v2, 4.0, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2926,16 +2922,16 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX7-NEXT: v_max_f32_e32 v2, 4.0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2952,16 +2948,16 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX6-NEXT: v_max_f32_e32 v2, 4.0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3097,27 +3093,27 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_bf16: @@ -3264,25 +3260,25 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_bf16: @@ -3558,27 +3554,27 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_bf16__offset: @@ -3731,25 +3727,25 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_bf16__offset: @@ -3915,38 +3911,38 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 ; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -3967,37 +3963,37 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 ; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3 -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4023,22 +4019,22 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4051,38 +4047,38 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4097,37 +4093,37 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4142,28 +4138,28 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v2, v2 +; GFX10-NEXT: v_not_b32_e32 v3, v3 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -4186,20 +4182,20 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4221,20 +4217,20 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4256,22 +4252,22 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4292,19 +4288,19 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4325,19 +4321,19 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4368,30 +4364,29 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -4422,29 +4417,28 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3 -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4471,22 +4465,22 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4510,29 +4504,28 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4558,28 +4551,27 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4602,21 +4594,21 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -4640,20 +4632,20 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4676,20 +4668,20 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4712,22 +4704,22 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4749,19 +4741,19 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4783,19 +4775,19 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4897,14 +4889,13 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-LABEL: local_atomic_fmax_ret_bf16__offset__align4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 +; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 ; GFX942-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX942-NEXT: v_max_f32_e32 v1, 4.0, v1 ; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 @@ -4919,6 +4910,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB18_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5035,14 +5027,13 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-LABEL: local_atomic_fmax_ret_bf16__offset__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: v_max_f32_e32 v1, 4.0, v1 ; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1 @@ -5056,6 +5047,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5197,27 +5189,26 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, 4.0, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, 4.0, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -5240,26 +5231,25 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v1, 4.0, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, 4.0, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -5280,21 +5270,21 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX942-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX942-NEXT: v_add3_u32 v3, v3, v1, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX942-NEXT: v_max_f32_e32 v2, 4.0, v2 +; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v3, v3, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX942-NEXT: v_and_or_b32 v1, v2, s3, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_and_or_b32 v2, v1, s3, v2 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB19_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5310,26 +5300,25 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, 4.0, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5347,25 +5336,24 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, 4.0, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5382,21 +5370,21 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_max_f32_e32 v2, 4.0, v2 +; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -5414,20 +5402,20 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX90A-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v3, v3, v1, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX90A-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: v_max_f32_e32 v2, 4.0, v2 +; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v1, s7, v2 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5444,20 +5432,20 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX908-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v3, v3, v1, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX908-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX908-NEXT: v_max_f32_e32 v2, 4.0, v2 +; GFX908-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v3, v3, v2, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: v_and_or_b32 v2, v1, s7, v2 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5473,21 +5461,21 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_max_f32_e32 v2, 4.0, v2 +; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5503,17 +5491,17 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_max_f32_e32 v2, 4.0, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5530,17 +5518,17 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_max_f32_e32 v2, 4.0, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5592,25 +5580,25 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX942-LABEL: local_atomic_fmax_ret_v2f16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 +; GFX942-NEXT: ds_read_b32 v3, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX942-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v1, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX942-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB20_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_ret_v2f16: @@ -5668,24 +5656,24 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-LABEL: local_atomic_fmax_ret_v2f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v1, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_v2f16: @@ -5864,25 +5852,25 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX942-LABEL: local_atomic_fmax_ret_v2f16__offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX942-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX942-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v1, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX942-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB21_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_ret_v2f16__offset: @@ -5940,24 +5928,24 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-LABEL: local_atomic_fmax_ret_v2f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX90A-NEXT: v_pk_max_f16 v1, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_v2f16__offset: @@ -6113,15 +6101,15 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6141,14 +6129,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX942-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB22_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6164,15 +6152,15 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6190,14 +6178,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB22_1 @@ -6214,13 +6202,13 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6236,13 +6224,13 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6260,16 +6248,16 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6375,15 +6363,15 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6403,14 +6391,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX942-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB23_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6426,15 +6414,15 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6452,14 +6440,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 @@ -6476,13 +6464,13 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6498,13 +6486,13 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6522,16 +6510,16 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 +; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6738,41 +6726,41 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX942-LABEL: local_atomic_fmax_ret_v2bf16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 +; GFX942-NEXT: ds_read_b32 v3, v0 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX942-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX942-NEXT: v_max_f32_e32 v5, v5, v4 +; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v1, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v1, v1 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB24_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_v2bf16: @@ -6910,40 +6898,40 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-LABEL: local_atomic_fmax_ret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_v2bf16: @@ -7216,41 +7204,41 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX942-LABEL: local_atomic_fmax_ret_v2bf16__offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX942-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX942-NEXT: v_max_f32_e32 v5, v5, v4 +; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v1, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v1, v1 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB25_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_v2bf16__offset: @@ -7388,40 +7376,40 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-LABEL: local_atomic_fmax_ret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_v2bf16__offset: @@ -7601,34 +7589,31 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v4, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7653,33 +7638,32 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v4, v2 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -7702,27 +7686,27 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX942-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v2 ; GFX942-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB26_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7740,32 +7724,30 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v4, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7786,32 +7768,30 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v4, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7831,27 +7811,27 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v2 ; GFX10-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -7871,26 +7851,26 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 ; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7909,26 +7889,26 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_max_f32_e32 v4, v4, v2 ; GFX908-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7946,29 +7926,29 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v4, v4, v2 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8067,34 +8047,31 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v4, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8119,33 +8096,32 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v4, v2 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8168,27 +8144,27 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX942-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v2 ; GFX942-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 +; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8206,32 +8182,30 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v4, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8252,32 +8226,30 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v4, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -8297,27 +8269,27 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v2 ; GFX10-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -8337,26 +8309,26 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 ; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8375,26 +8347,26 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_max_f32_e32 v4, v4, v2 ; GFX908-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8412,29 +8384,29 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_max_f32_e32 v4, v4, v2 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 0aa8d33ea7429..4a6428eb338ff 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -886,21 +886,21 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX942-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX942-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_min_f16_e32 v4, 4.0, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_f16: @@ -1025,21 +1025,21 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX90A-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_min_f16_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_f16: @@ -1285,21 +1285,21 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX942-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX942-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_min_f16_e32 v4, 4.0, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_f16__offset: @@ -1430,21 +1430,21 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX90A-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_min_f16_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_f16__offset: @@ -1598,29 +1598,29 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -1641,29 +1641,29 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, 4.0, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -1688,16 +1688,16 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX942-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX942-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_min_f16_e32 v4, 4.0, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1710,29 +1710,29 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1747,29 +1747,29 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1784,24 +1784,24 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v2, v2 +; GFX10-NEXT: v_not_b32_e32 v3, v3 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX10-NEXT: v_min_f16_e32 v4, 4.0, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 @@ -1823,16 +1823,16 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX90A-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_min_f16_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1853,16 +1853,16 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX908-NEXT: v_min_f16_e32 v4, 4.0, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1884,17 +1884,17 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX8-NEXT: v_min_f16_e32 v4, 4.0, v4 +; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1915,18 +1915,18 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1947,18 +1947,18 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX6-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1989,21 +1989,20 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -2034,21 +2033,21 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, 4.0, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, 4.0, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2074,16 +2073,16 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX942-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX942-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_min_f16_e32 v4, 4.0, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2106,21 +2105,20 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2145,21 +2143,21 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, 4.0, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2182,17 +2180,17 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX10-NEXT: v_min_f16_e32 v4, 4.0, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 @@ -2215,16 +2213,16 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX90A-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_min_f16_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2246,16 +2244,16 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX908-NEXT: v_min_f16_e32 v4, 4.0, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2278,17 +2276,17 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX8-NEXT: v_min_f16_e32 v4, 4.0, v4 +; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2310,18 +2308,18 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2343,18 +2341,18 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX6-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2439,13 +2437,12 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX942-LABEL: local_atomic_fmin_ret_f16__offset__align4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 +; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX942-NEXT: v_min_f16_e32 v1, 4.0, v1 ; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1 @@ -2453,6 +2450,7 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2547,13 +2545,12 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX90A-LABEL: local_atomic_fmin_ret_f16__offset__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX90A-NEXT: v_min_f16_e32 v1, 4.0, v1 ; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1 @@ -2561,6 +2558,7 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2687,18 +2685,17 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, 4.0, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, 4.0, v2.l +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -2721,18 +2718,18 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v2, v2 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v1, 4.0, v1 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v1, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, 4.0, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2752,14 +2749,14 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX942-NEXT: v_min_f16_e32 v1, 4.0, v1 -; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX942-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX942-NEXT: v_min_f16_e32 v2, 4.0, v2 +; GFX942-NEXT: v_and_or_b32 v2, v1, s2, v2 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2774,18 +2771,17 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, 4.0, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, 4.0, v2.l +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2802,18 +2798,18 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v1, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v1, 4.0, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, 4.0, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2830,16 +2826,16 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v1, 4.0, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX10-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX10-NEXT: v_min_f16_e32 v2, 4.0, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 @@ -2856,14 +2852,14 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v1, 4.0, v1 -; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX90A-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX90A-NEXT: v_min_f16_e32 v2, 4.0, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v1, s6, v2 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2879,14 +2875,14 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX908-NEXT: v_min_f16_e32 v1, 4.0, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX908-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX908-NEXT: v_min_f16_e32 v2, 4.0, v2 +; GFX908-NEXT: v_and_or_b32 v2, v1, s6, v2 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2902,15 +2898,15 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_min_f16_e32 v1, 4.0, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_min_f16_e32 v2, 4.0, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2926,16 +2922,16 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX7-NEXT: v_min_f32_e32 v2, 4.0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2952,16 +2948,16 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX6-NEXT: v_min_f32_e32 v2, 4.0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3097,27 +3093,27 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_bf16: @@ -3264,25 +3260,25 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_bf16: @@ -3558,27 +3554,27 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_bf16__offset: @@ -3731,25 +3727,25 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_bf16__offset: @@ -3915,38 +3911,38 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 ; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -3967,37 +3963,37 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 ; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3 -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4023,22 +4019,22 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4051,38 +4047,38 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4097,37 +4093,37 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4142,28 +4138,28 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v2, v2 +; GFX10-NEXT: v_not_b32_e32 v3, v3 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -4186,20 +4182,20 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4221,20 +4217,20 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4256,22 +4252,22 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4292,19 +4288,19 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4325,19 +4321,19 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4368,30 +4364,29 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -4422,29 +4417,28 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3 -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4471,22 +4465,22 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4510,29 +4504,28 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4558,28 +4551,27 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4602,21 +4594,21 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -4640,20 +4632,20 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4676,20 +4668,20 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4712,22 +4704,22 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4749,19 +4741,19 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4783,19 +4775,19 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4897,14 +4889,13 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-LABEL: local_atomic_fmin_ret_bf16__offset__align4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 +; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 ; GFX942-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX942-NEXT: v_min_f32_e32 v1, 4.0, v1 ; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 @@ -4919,6 +4910,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB18_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5035,14 +5027,13 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-LABEL: local_atomic_fmin_ret_bf16__offset__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: v_min_f32_e32 v1, 4.0, v1 ; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1 @@ -5056,6 +5047,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5197,27 +5189,26 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, 4.0, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, 4.0, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -5240,26 +5231,25 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v1, 4.0, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, 4.0, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -5280,21 +5270,21 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX942-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX942-NEXT: v_add3_u32 v3, v3, v1, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX942-NEXT: v_min_f32_e32 v2, 4.0, v2 +; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v3, v3, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX942-NEXT: v_and_or_b32 v1, v2, s3, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_and_or_b32 v2, v1, s3, v2 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB19_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5310,26 +5300,25 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, 4.0, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5347,25 +5336,24 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, 4.0, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5382,21 +5370,21 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_min_f32_e32 v2, 4.0, v2 +; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -5414,20 +5402,20 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX90A-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v3, v3, v1, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX90A-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: v_min_f32_e32 v2, 4.0, v2 +; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v1, s7, v2 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5444,20 +5432,20 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX908-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v3, v3, v1, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX908-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX908-NEXT: v_min_f32_e32 v2, 4.0, v2 +; GFX908-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v3, v3, v2, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: v_and_or_b32 v2, v1, s7, v2 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5473,21 +5461,21 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_min_f32_e32 v2, 4.0, v2 +; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5503,17 +5491,17 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_min_f32_e32 v2, 4.0, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5530,17 +5518,17 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_min_f32_e32 v2, 4.0, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5592,25 +5580,25 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX942-LABEL: local_atomic_fmin_ret_v2f16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 +; GFX942-NEXT: ds_read_b32 v3, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX942-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v1, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX942-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB20_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_ret_v2f16: @@ -5668,24 +5656,24 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-LABEL: local_atomic_fmin_ret_v2f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v1, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_v2f16: @@ -5864,25 +5852,25 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX942-LABEL: local_atomic_fmin_ret_v2f16__offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX942-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX942-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v1, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX942-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB21_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_ret_v2f16__offset: @@ -5940,24 +5928,24 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-LABEL: local_atomic_fmin_ret_v2f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX90A-NEXT: v_pk_max_f16 v1, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_v2f16__offset: @@ -6113,15 +6101,15 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6141,14 +6129,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX942-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB22_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6164,15 +6152,15 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6190,14 +6178,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB22_1 @@ -6214,13 +6202,13 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6236,13 +6224,13 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6260,16 +6248,16 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6375,15 +6363,15 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6403,14 +6391,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX942-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB23_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6426,15 +6414,15 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6452,14 +6440,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_min_f16 v3, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 @@ -6476,13 +6464,13 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6498,13 +6486,13 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6522,16 +6510,16 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 +; GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6738,41 +6726,41 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX942-LABEL: local_atomic_fmin_ret_v2bf16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 +; GFX942-NEXT: ds_read_b32 v3, v0 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX942-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX942-NEXT: v_min_f32_e32 v5, v5, v4 +; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v1, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v1, v1 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB24_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_v2bf16: @@ -6910,40 +6898,40 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-LABEL: local_atomic_fmin_ret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_v2bf16: @@ -7216,41 +7204,41 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX942-LABEL: local_atomic_fmin_ret_v2bf16__offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX942-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX942-NEXT: v_min_f32_e32 v5, v5, v4 +; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v1, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v1, v1 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB25_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_v2bf16__offset: @@ -7388,40 +7376,40 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-LABEL: local_atomic_fmin_ret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_v2bf16__offset: @@ -7601,34 +7589,31 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v4, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7653,33 +7638,32 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v4, v2 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -7702,27 +7686,27 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX942-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v2 ; GFX942-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB26_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7740,32 +7724,30 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v4, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7786,32 +7768,30 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v4, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7831,27 +7811,27 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v4, v4, v2 ; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -7871,26 +7851,26 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 ; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7909,26 +7889,26 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_min_f32_e32 v4, v4, v2 ; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7946,29 +7926,29 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8067,34 +8047,31 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v4, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8119,33 +8096,32 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v4, v2 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8168,27 +8144,27 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX942-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v2 ; GFX942-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 +; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8206,32 +8182,30 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v4, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8252,32 +8226,30 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v4, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -8297,27 +8269,27 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v4, v4, v2 ; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -8337,26 +8309,26 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 ; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8375,26 +8347,26 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_min_f32_e32 v4, v4, v2 ; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8412,29 +8384,29 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 929bb61ddabcf..7f95e7df3f6b5 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -50,17 +50,17 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX942-LABEL: local_atomic_fsub_ret_f32: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v1, v0 +; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -119,17 +119,17 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX90A-LABEL: local_atomic_fsub_ret_f32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 +; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -262,17 +262,17 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-LABEL: local_atomic_fsub_ret_f32__offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v1, v0 offset:65532 +; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -331,17 +331,17 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-LABEL: local_atomic_fsub_ret_f32__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -453,14 +453,13 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -479,12 +478,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -499,14 +498,13 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX11-NEXT: v_add_f32_e32 v2, -4.0, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -523,13 +521,13 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB2_1 @@ -545,12 +543,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -565,12 +563,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_add_f32_e32 v1, -4.0, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -586,12 +584,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_add_f32_e32 v1, -4.0, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -607,12 +605,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB2_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -628,12 +626,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB2_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -656,14 +654,13 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -682,12 +679,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 +; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB3_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -702,14 +699,13 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX11-NEXT: v_add_f32_e32 v2, -4.0, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -726,13 +722,13 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 @@ -748,12 +744,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 +; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -768,12 +764,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_add_f32_e32 v1, -4.0, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 +; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -789,12 +785,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_add_f32_e32 v1, -4.0, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 +; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -810,12 +806,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 +; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -832,12 +828,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -887,18 +883,18 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX942-LABEL: local_atomic_fsub_ret_f64: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ds_read_b64 v[4:5], v0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: ds_read_b64 v[0:1], v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX942-NEXT: v_add_f64 v[0:1], v[4:5], -4.0 ; GFX942-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB4_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -957,18 +953,18 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX90A-LABEL: local_atomic_fsub_ret_f64: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ds_read_b64 v[4:5], v0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: ds_read_b64 v[0:1], v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[0:1], v[4:5], -4.0 ; GFX90A-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1104,18 +1100,18 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-LABEL: local_atomic_fsub_ret_f64__offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ds_read_b64 v[4:5], v0 offset:65528 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: ds_read_b64 v[0:1], v0 offset:65528 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX942-NEXT: v_add_f64 v[0:1], v[4:5], -4.0 ; GFX942-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] offset:65528 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB5_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1174,18 +1170,18 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-LABEL: local_atomic_fsub_ret_f64__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ds_read_b64 v[4:5], v0 offset:65528 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: ds_read_b64 v[0:1], v0 offset:65528 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[0:1], v[4:5], -4.0 ; GFX90A-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] offset:65528 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1300,14 +1296,13 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[1:2], -4.0, v[3:4] +; GFX12-NEXT: v_add_f64_e32 v[3:4], -4.0, v[1:2] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] +; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] +; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1326,12 +1321,12 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX942-NEXT: v_add_f64 v[2:3], v[4:5], -4.0 -; GFX942-NEXT: ds_cmpst_rtn_b64 v[2:3], v0, v[4:5], v[2:3] +; GFX942-NEXT: v_add_f64 v[4:5], v[2:3], -4.0 +; GFX942-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1346,14 +1341,13 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 +; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] +; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] +; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1370,14 +1364,14 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 +; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] +; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB6_1 @@ -1393,12 +1387,12 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], -4.0 -; GFX90A-NEXT: ds_cmpst_rtn_b64 v[2:3], v0, v[4:5], v[2:3] +; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], -4.0 +; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1413,13 +1407,13 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 -; GFX908-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] +; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 +; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1435,13 +1429,13 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 -; GFX8-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] +; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 +; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1457,13 +1451,13 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 -; GFX7-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] +; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 +; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB6_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1479,13 +1473,13 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] +; GFX6-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 +; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v2, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB6_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1508,14 +1502,13 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[1:2], -4.0, v[3:4] +; GFX12-NEXT: v_add_f64_e32 v[3:4], -4.0, v[1:2] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] offset:65528 +; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] +; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1534,12 +1527,12 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX942-NEXT: v_add_f64 v[2:3], v[4:5], -4.0 -; GFX942-NEXT: ds_cmpst_rtn_b64 v[2:3], v0, v[4:5], v[2:3] offset:65528 +; GFX942-NEXT: v_add_f64 v[4:5], v[2:3], -4.0 +; GFX942-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] offset:65528 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB7_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1554,14 +1547,13 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 +; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] offset:65528 +; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] +; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1578,14 +1570,14 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 +; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 +; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB7_1 @@ -1601,12 +1593,12 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], -4.0 -; GFX90A-NEXT: ds_cmpst_rtn_b64 v[2:3], v0, v[4:5], v[2:3] offset:65528 +; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], -4.0 +; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] offset:65528 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1621,13 +1613,13 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 -; GFX908-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 +; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 +; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1643,13 +1635,13 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 -; GFX8-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 +; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 +; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1665,13 +1657,13 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_add_f64 v[1:2], v[3:4], -4.0 -; GFX7-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 +; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], -4.0 +; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1688,13 +1680,13 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: v_add_f64 v[0:1], v[3:4], -4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX6-NEXT: v_add_f64 v[3:4], v[0:1], -4.0 +; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v2, v[0:1], v[3:4] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1799,30 +1791,30 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX942-NEXT: ds_read_b32 v2, v1 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX942-NEXT: ds_read_b32 v3, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_and_b32_e32 v0, 24, v3 -; GFX942-NEXT: v_lshlrev_b32_e64 v3, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v3, v3 +; GFX942-NEXT: v_and_b32_e32 v0, 24, v2 +; GFX942-NEXT: v_lshlrev_b32_e64 v2, v2, s0 +; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX942-NEXT: v_add_f16_e32 v2, -4.0, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX942-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX942-NEXT: v_add_f16_e32 v4, -4.0, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_f16: @@ -1933,30 +1925,30 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX90A-NEXT: ds_read_b32 v2, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX90A-NEXT: ds_read_b32 v3, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_and_b32_e32 v0, 24, v3 -; GFX90A-NEXT: v_lshlrev_b32_e64 v3, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, 24, v2 +; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v2, s4 +; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX90A-NEXT: v_add_f16_e32 v2, -4.0, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX90A-NEXT: v_add_f16_e32 v4, -4.0, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_ret_f16: @@ -2187,30 +2179,30 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_add_u32_e32 v0, 0xfffe, v0 ; GFX942-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX942-NEXT: ds_read_b32 v2, v1 +; GFX942-NEXT: ds_read_b32 v3, v1 ; GFX942-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v3, v0, s0 -; GFX942-NEXT: v_not_b32_e32 v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e64 v2, v0, s0 +; GFX942-NEXT: v_not_b32_e32 v2, v2 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX942-NEXT: v_add_f16_e32 v2, -4.0, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX942-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX942-NEXT: v_add_f16_e32 v4, -4.0, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_f16__offset: @@ -2327,30 +2319,30 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v0, 0xfffe, v0 ; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX90A-NEXT: ds_read_b32 v2, v1 +; GFX90A-NEXT: ds_read_b32 v3, v1 ; GFX90A-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v3, v0, s4 -; GFX90A-NEXT: v_not_b32_e32 v3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX90A-NEXT: v_not_b32_e32 v2, v2 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX90A-NEXT: v_add_f16_e32 v2, -4.0, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX90A-NEXT: v_add_f16_e32 v4, -4.0, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_ret_f16__offset: @@ -2502,27 +2494,27 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -2543,28 +2535,28 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2589,15 +2581,15 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX942-NEXT: v_add_f16_e32 v3, -4.0, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX942-NEXT: v_add_f16_e32 v4, -4.0, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2610,27 +2602,27 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2645,28 +2637,28 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2681,23 +2673,23 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v2, v2 +; GFX10-NEXT: v_not_b32_e32 v3, v3 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX10-NEXT: v_add_f16_e32 v3, -4.0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX10-NEXT: v_add_f16_e32 v4, -4.0, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 @@ -2719,15 +2711,15 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX90A-NEXT: v_add_f16_e32 v3, -4.0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX90A-NEXT: v_add_f16_e32 v4, -4.0, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2748,15 +2740,15 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX908-NEXT: v_add_f16_e32 v3, -4.0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX908-NEXT: v_add_f16_e32 v4, -4.0, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2778,16 +2770,16 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX8-NEXT: v_add_f16_e32 v3, -4.0, v3 -; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX8-NEXT: v_add_f16_e32 v4, -4.0, v4 +; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2808,18 +2800,18 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX7-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX7-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2840,18 +2832,18 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX6-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2882,19 +2874,19 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -2925,20 +2917,19 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2964,15 +2955,15 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX942-NEXT: v_add_f16_e32 v3, -4.0, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX942-NEXT: v_add_f16_e32 v4, -4.0, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2995,19 +2986,19 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3032,20 +3023,19 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3068,16 +3058,16 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX10-NEXT: v_add_f16_e32 v3, -4.0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX10-NEXT: v_add_f16_e32 v4, -4.0, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 @@ -3100,15 +3090,15 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX90A-NEXT: v_add_f16_e32 v3, -4.0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX90A-NEXT: v_add_f16_e32 v4, -4.0, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3130,15 +3120,15 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX908-NEXT: v_add_f16_e32 v3, -4.0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX908-NEXT: v_add_f16_e32 v4, -4.0, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3161,16 +3151,16 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX8-NEXT: v_add_f16_e32 v3, -4.0, v3 -; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX8-NEXT: v_add_f16_e32 v4, -4.0, v4 +; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3192,18 +3182,18 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX7-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX7-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3225,18 +3215,18 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX6-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3318,19 +3308,19 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX942-LABEL: local_atomic_fsub_ret_f16__offset__align4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 +; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_add_f16_e32 v1, -4.0, v2 ; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1 ; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3421,19 +3411,19 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX90A-LABEL: local_atomic_fsub_ret_f16__offset__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_add_f16_e32 v1, -4.0, v2 ; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3558,16 +3548,16 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -3590,17 +3580,16 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v2, -4.0, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, -4.0, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -3620,13 +3609,13 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_add_f16_e32 v1, -4.0, v2 -; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX942-NEXT: v_add_f16_e32 v2, -4.0, v1 +; GFX942-NEXT: v_and_or_b32 v2, v1, s2, v2 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3641,16 +3630,16 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3667,17 +3656,16 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v2, -4.0, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, -4.0, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3694,15 +3682,15 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_add_f16_e32 v1, -4.0, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX10-NEXT: v_add_f16_e32 v2, -4.0, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 @@ -3719,13 +3707,13 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_add_f16_e32 v1, -4.0, v2 -; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX90A-NEXT: v_add_f16_e32 v2, -4.0, v1 +; GFX90A-NEXT: v_and_or_b32 v2, v1, s6, v2 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3741,13 +3729,13 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_add_f16_e32 v1, -4.0, v2 -; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX908-NEXT: v_add_f16_e32 v2, -4.0, v1 +; GFX908-NEXT: v_and_or_b32 v2, v1, s6, v2 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3763,14 +3751,14 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_add_f16_e32 v1, -4.0, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX8-NEXT: v_add_f16_e32 v2, -4.0, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3786,16 +3774,16 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3812,16 +3800,16 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3957,27 +3945,27 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_bf16: @@ -4124,25 +4112,25 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_ret_bf16: @@ -4416,27 +4404,27 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_bf16__offset: @@ -4589,25 +4577,25 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_ret_bf16__offset: @@ -4771,38 +4759,38 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 ; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -4823,37 +4811,37 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 ; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4879,22 +4867,22 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4907,38 +4895,38 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4953,37 +4941,37 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4998,28 +4986,28 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v2, v2 +; GFX10-NEXT: v_not_b32_e32 v3, v3 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -5042,20 +5030,20 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5077,20 +5065,20 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5112,22 +5100,22 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5148,18 +5136,18 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5180,18 +5168,18 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5222,30 +5210,29 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -5276,29 +5263,28 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -5325,22 +5311,22 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5364,29 +5350,28 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5412,28 +5397,27 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5456,21 +5440,21 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -5494,20 +5478,20 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5530,20 +5514,20 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5566,22 +5550,22 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5603,18 +5587,18 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5636,18 +5620,18 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5749,14 +5733,13 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-LABEL: local_atomic_fsub_ret_bf16__offset__align4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534 +; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 ; GFX942-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v1 ; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 @@ -5771,6 +5754,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB18_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5887,14 +5871,13 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-LABEL: local_atomic_fsub_ret_bf16__offset__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v1 ; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1 @@ -5908,6 +5891,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6047,27 +6031,26 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, -4.0, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, -4.0, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -6090,26 +6073,25 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX12-FAKE16-NEXT: v_add_f32_e32 v1, -4.0, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, -4.0, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -6130,21 +6112,21 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v1 -; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX942-NEXT: v_add3_u32 v3, v3, v1, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v2 +; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v3, v3, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX942-NEXT: v_and_or_b32 v1, v2, s3, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_and_or_b32 v2, v1, s3, v2 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB19_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6160,26 +6142,25 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, -4.0, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, -4.0, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6197,25 +6178,24 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, -4.0, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, -4.0, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6232,21 +6212,21 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_add_f32_e32 v1, -4.0, v1 -; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v2 +; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -6264,20 +6244,20 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v1 -; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v3, v3, v1, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX90A-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v2 +; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v1, s7, v2 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6294,20 +6274,20 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX908-NEXT: v_add_f32_e32 v1, -4.0, v1 -; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v3, v3, v1, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX908-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v2 +; GFX908-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v3, v3, v2, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: v_and_or_b32 v2, v1, s7, v2 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6322,22 +6302,22 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_add_f32_e32 v1, -4.0, v1 -; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v2 +; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6353,16 +6333,16 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6379,16 +6359,16 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6438,17 +6418,17 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX942-LABEL: local_atomic_fsub_ret_v2f16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 +; GFX942-NEXT: ds_read_b32 v3, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB20_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6507,17 +6487,17 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-LABEL: local_atomic_fsub_ret_v2f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6693,17 +6673,17 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX942-LABEL: local_atomic_fsub_ret_v2f16__offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB21_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6762,17 +6742,17 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-LABEL: local_atomic_fsub_ret_v2f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6926,14 +6906,13 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6952,12 +6931,12 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX942-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB22_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6972,14 +6951,13 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6996,13 +6974,13 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB22_1 @@ -7018,12 +6996,12 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7038,12 +7016,12 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7059,14 +7037,14 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: v_sub_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_sub_f16_e32 v4, v3, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_e32 v4, v2, v1 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7171,14 +7149,13 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7197,12 +7174,12 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX942-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB23_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7217,14 +7194,13 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7241,13 +7217,13 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 @@ -7263,12 +7239,12 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7283,12 +7259,12 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7304,14 +7280,14 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: v_sub_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_sub_f16_e32 v4, v3, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_e32 v4, v2, v1 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7518,41 +7494,41 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX942-LABEL: local_atomic_fsub_ret_v2bf16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 +; GFX942-NEXT: ds_read_b32 v3, v0 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX942-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX942-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX942-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v1, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v1, v1 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB24_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_v2bf16: @@ -7690,40 +7666,40 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-LABEL: local_atomic_fsub_ret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_ret_v2bf16: @@ -7996,41 +7972,41 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX942-LABEL: local_atomic_fsub_ret_v2bf16__offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX942-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX942-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX942-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v1, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v1, v1 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB25_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_v2bf16__offset: @@ -8168,40 +8144,40 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-LABEL: local_atomic_fsub_ret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fsub_ret_v2bf16__offset: @@ -8381,34 +8357,31 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8433,33 +8406,32 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8482,27 +8454,27 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX942-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX942-NEXT: v_sub_f32_e32 v4, v4, v2 ; GFX942-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB26_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8520,32 +8492,30 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8566,32 +8536,30 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -8611,27 +8579,27 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v4, v4, v2 ; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -8651,26 +8619,26 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 ; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8689,26 +8657,26 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_sub_f32_e32 v4, v4, v2 ; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8726,29 +8694,29 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2 ; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8847,34 +8815,31 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8899,33 +8864,32 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8948,27 +8912,27 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX942-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX942-NEXT: v_sub_f32_e32 v4, v4, v2 ; GFX942-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 +; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8986,32 +8950,30 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -9032,32 +8994,30 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -9077,27 +9037,27 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v4, v4, v2 ; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -9117,26 +9077,26 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 ; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9155,26 +9115,26 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_sub_f32_e32 v4, v4, v2 ; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9192,29 +9152,29 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2 ; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 +; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9339,17 +9299,17 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX942-LABEL: local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v1, v0 +; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB28_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9408,17 +9368,17 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX90A-LABEL: local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 +; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9529,14 +9489,13 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -9555,12 +9514,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB29_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9575,14 +9534,13 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX11-NEXT: v_add_f32_e32 v2, -4.0, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -9599,13 +9557,13 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB29_1 @@ -9621,12 +9579,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9641,12 +9599,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_add_f32_e32 v1, -4.0, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB29_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9662,12 +9620,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_add_f32_e32 v1, -4.0, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB29_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9683,12 +9641,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9704,12 +9662,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll index bcece19ae5fdd..5c90957edd9f5 100644 --- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll +++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll @@ -16,11 +16,11 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) i ; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: ; %bb.3: ; %.then ; GCN-NEXT: s_or_saveexec_b32 s1, -1 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v3, s1 -; GCN-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s1 -; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: v_mov_b32_e32 v4, -1 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: buffer_store_dword v4, v0, s[4:7], 0 offen diff --git a/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll b/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll index ce40085feb0d0..895b68b5a9145 100644 --- a/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll +++ b/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll @@ -25,7 +25,8 @@ define i32 @test(i32, i32) local_unnamed_addr #0 { %11 = sub nsw i32 %7, %9 %12 = icmp slt i32 %10, %11 br i1 %12, label %5, label %13 -; CHECK: if r2 s> r1 goto -10 +; CHECK: r1 = r3 +; CHECK: if r2 s> r3 goto -10 ;