diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 1077398bee4cd..abaa82809e28f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -596,9 +596,12 @@ static bool fnegFoldsIntoOp(const SDNode *N) {
     // TODO: Is there a benefit to checking the conditions performFNegCombine
     // does? We don't for the other cases.
     SDValue BCSrc = N->getOperand(0);
-    return BCSrc.getOpcode() == ISD::BUILD_VECTOR &&
-           BCSrc.getNumOperands() == 2 &&
-           BCSrc.getOperand(1).getValueSizeInBits() == 32;
+    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
+      return BCSrc.getNumOperands() == 2 &&
+             BCSrc.getOperand(1).getValueSizeInBits() == 32;
+    }
+
+    return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
   }
 
   return fnegFoldsIntoOpcode(Opc);
@@ -4182,6 +4185,25 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
       return Result;
     }
 
+    if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32) {
+      // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
+      // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
+      SDValue LHS =
+          DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
+      SDValue RHS =
+          DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
+
+      SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
+      SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
+
+      SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, MVT::f32,
+                                      BCSrc.getOperand(0), NegLHS, NegRHS);
+      if (!BCSrc.hasOneUse())
+        DAG.ReplaceAllUsesWith(BCSrc,
+                               DAG.getNode(ISD::FNEG, SL, VT, NewSelect));
+      return NewSelect;
+    }
+
     return SDValue();
   }
   default:
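To make the intent of the new combine concrete, here is a hypothetical reduced IR input (the function and value names are illustrative, not taken from the patch) showing the pattern it rewrites: an i32 select that is only reinterpreted as f32 and negated afterwards. With the combine, the fneg is pushed through the bitcast onto the select operands, where it can fold into v_cndmask_b32 source modifiers instead of lowering to a separate v_xor_b32 of 0x80000000:

; Hypothetical reduced example, not part of the patch.
define float @fneg_bitcast_select(i1 %cond, i32 %lhs, i32 %rhs) {
  %sel = select i1 %cond, i32 %lhs, i32 %rhs   ; i32 select feeding a bitcast
  %cast = bitcast i32 %sel to float            ; reinterpret as f32
  %fneg = fneg float %cast                     ; folds into the select operands
  ret float %fneg
}

For selects with additional users, the !BCSrc.hasOneUse() path above re-expresses the original value as an fneg of the new, negated select before replacing the remaining uses; the fneg_xor_select_f64_multi_user updates below appear to exercise exactly this case.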
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 4a3e916dfdea4..5667d330fd25f 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -3023,16 +3023,17 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:    v_bfrev_b32_e32 v0, 1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, 1, s4
-; SI-NEXT:    s_cselect_b32 s3, 0, s3
-; SI-NEXT:    s_xor_b32 s3, s3, 0x80000000
-; SI-NEXT:    s_cmp_eq_u32 s4, 1
+; SI-NEXT:    s_bitcmp1_b32 s4, 0
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    s_and_b64 s[6:7], s[4:5], exec
+; SI-NEXT:    v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
 ; SI-NEXT:    s_cselect_b32 s2, 0, s2
-; SI-NEXT:    s_cselect_b32 s3, 0, s3
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
+; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
 ; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; SI-NEXT:    s_endpgm
@@ -3042,16 +3043,17 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1
 ; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT:    v_bfrev_b32_e32 v0, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s4, 1, s4
-; VI-NEXT:    s_cselect_b32 s3, 0, s3
-; VI-NEXT:    s_xor_b32 s3, s3, 0x80000000
-; VI-NEXT:    s_cmp_eq_u32 s4, 1
+; VI-NEXT:    s_bitcmp1_b32 s4, 0
+; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_and_b64 s[6:7], s[4:5], exec
+; VI-NEXT:    v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
 ; VI-NEXT:    s_cselect_b32 s2, 0, s2
-; VI-NEXT:    s_cselect_b32 s3, 0, s3
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -3067,9 +3069,9 @@ define double @v_fneg_select_infloop_regression_f64(double %arg, i1 %arg1) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
+; GCN-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -v1, v3, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -3221,19 +3223,17 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %a
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:    v_bfrev_b32_e32 v0, 1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, 1, s4
-; SI-NEXT:    s_cselect_b32 s2, 0, s2
-; SI-NEXT:    s_xor_b32 s2, s2, 0x80000000
-; SI-NEXT:    s_cmp_eq_u32 s4, 1
-; SI-NEXT:    s_cselect_b32 s3, 0, s3
-; SI-NEXT:    s_cselect_b32 s2, 0, s2
-; SI-NEXT:    s_xor_b32 s3, s3, 0x80000000
-; SI-NEXT:    s_cmp_eq_u32 s4, 1
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    s_cselect_b32 s2, 0, s3
-; SI-NEXT:    v_mov_b32_e32 v3, s1
+; SI-NEXT:    s_bitcmp1_b32 s4, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT:    v_cndmask_b32_e64 v2, -v1, v0, s[4:5]
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v0, v2, 0, s[4:5]
+; SI-NEXT:    v_mov_b32_e32 v3, s1
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; SI-NEXT:    s_endpgm
@@ -3243,19 +3243,17 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %a
 ; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT:    v_bfrev_b32_e32 v0, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s4, 1, s4
-; VI-NEXT:    s_cselect_b32 s2, 0, s2
-; VI-NEXT:    s_xor_b32 s2, s2, 0x80000000
-; VI-NEXT:    s_cmp_eq_u32 s4, 1
-; VI-NEXT:    s_cselect_b32 s3, 0, s3
-; VI-NEXT:    s_cselect_b32 s2, 0, s2
-; VI-NEXT:    s_xor_b32 s3, s3, 0x80000000
-; VI-NEXT:    s_cmp_eq_u32 s4, 1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    s_cselect_b32 s2, 0, s3
-; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    s_bitcmp1_b32 s4, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v2, -v1, v0, s[4:5]
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
+; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, 0, s[4:5]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -3271,11 +3269,10 @@ define <2 x float> @v_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
+; GCN-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -v1, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, v3, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 4f2bba843d945..2c753aa081704 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -398,8 +398,7 @@ define double @fneg_xor_select_f64(i1 %cond, double %arg0, double %arg1) {
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
-; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -v4, -v2, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_f64:
@@ -407,10 +406,10 @@ define double @fneg_xor_select_f64(i1 %cond, double %arg0, double %arg1) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
-; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, -v4, -v2, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, double %arg0, double %arg1
   %fneg = fneg double %select
@@ -422,12 +421,12 @@ define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %ar
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, -v4, -v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX7-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
 ; GFX7-NEXT:    flat_store_dwordx2 v[5:6], v[0:1]
-; GFX7-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -435,12 +434,12 @@ define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %ar
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, -v4, -v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
 ; GFX9-NEXT:    global_store_dwordx2 v[5:6], v[0:1], off
-; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -448,12 +447,12 @@ define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %ar
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v4, v2 :: v_dual_cndmask_b32 v0, v3, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, -v4, -v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc_lo
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
 ; GFX11-NEXT:    global_store_b64 v[5:6], v[0:1], off
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -497,14 +496,13 @@ define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, d
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc
-; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, -v3, -v5, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -v2, v2, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_select_fneg_f64:
@@ -513,16 +511,13 @@ define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, d
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_and_b32 v1, 1, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, -v3, -v5, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, -v2, v2, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = fneg double %arg0
   %select0 = select i1 %cond0, double %arg1, double %fneg0
@@ -894,10 +889,9 @@ define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) {
 ; GCN-NEXT:    v_and_b32_e32 v5, 1, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
-; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -v2, -v4, vcc
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -v1, v1, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v0, v3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -909,12 +903,11 @@ define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v5
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, -v2, -v4, vcc_lo
 ; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 1, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, -v1, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %i = and i32 %arg, 1
   %i3 = icmp eq i32 %i, 0