diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 25e1eabb2c293..071c9406a1517 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1389,6 +1389,9 @@ def FeatureAddSubU64Insts : SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true", "Has v_add_u64 and v_sub_u64 instructions">; +def FeatureMadU32Inst : SubtargetFeature<"mad-u32-inst", "HasMadU32Inst", + "true", "Has v_mad_u32 instruction">; + def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts", "HasVMemToLDSLoad", "true", @@ -2049,6 +2052,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureVmemPrefInsts, FeatureLshlAddU64Inst, FeatureAddSubU64Insts, + FeatureMadU32Inst, FeatureLdsBarrierArriveAtomic, FeatureSetPrioIncWgInst, ]>; @@ -2839,6 +2843,9 @@ def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">, def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">, AssemblerPredicate<(all_of FeatureAddSubU64Insts)>; +def HasMadU32Inst : Predicate<"Subtarget->hasMadU32Inst()">, + AssemblerPredicate<(all_of FeatureMadU32Inst)>; + def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">, AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 3d7e678d2e54f..39b42002b907a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1134,15 +1134,26 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { SDLoc SL(N); bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32; unsigned Opc; + bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(1); if (Subtarget->hasMADIntraFwdBug()) Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 : AMDGPU::V_MAD_U64_U32_gfx11_e64; + else if (UseNoCarry) + Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64; else Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), Clamp }; + + if (UseNoCarry) { + MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops); + ReplaceUses(SDValue(N, 0), SDValue(Mad, 0)); + CurDAG->RemoveDeadNode(N); + return; + } + CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 04773c9c7b773..d51cee2b94ae0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -574,13 +574,22 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32( MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; + bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && + MRI->use_nodbg_empty(I.getOperand(1).getReg()); unsigned Opc; if (Subtarget->hasMADIntraFwdBug()) Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64 : AMDGPU::V_MAD_I64_I32_gfx11_e64; + else if (UseNoCarry) + Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64 + : AMDGPU::V_MAD_NC_I64_I32_e64; else Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64; + + if (UseNoCarry) + I.removeOperand(1); + I.setDesc(TII.get(Opc)); I.addOperand(*MF, MachineOperand::CreateImm(0)); I.addImplicitDefUseOperands(*MF); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 5357a375ae5a9..b824c66931288 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -273,6 +273,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasMinimum3Maximum3PKF16 = false; bool HasLshlAddU64Inst = false; bool HasAddSubU64Insts = false; + bool HasMadU32Inst = false; bool HasPointSampleAccel = false; bool HasLdsBarrierArriveAtomic = false; bool HasSetPrioIncWgInst = false; @@ -1521,9 +1522,16 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions. bool hasAddSubU64Insts() const { return HasAddSubU64Insts; } + // \returns true if the target has V_MAD_U32 instruction. + bool hasMadU32Inst() const { return HasMadU32Inst; } + // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions. bool hasVectorMulU64() const { return GFX1250Insts; } + // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32 + // instructions. + bool hasMadU64U32NoCarry() const { return GFX1250Insts; } + // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions. bool hasPkAddMinMaxInsts() const { return GFX1250Insts; } diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 88ec0aa0d840b..22447d33aad75 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -57,6 +57,14 @@ class V_MUL_PROF : VOP3_Profile

{ def V_LSHL_ADD_U64_PROF : VOP3_Profile; def VOP_F64_F64_F64_F64_DPP_PROF : VOP3_Profile; +def V_MAD_U32_PROF: VOP3_Profile { + let HasExtVOP3DPP = 0; + let HasExt64BitDPP = 1; +} +def VOP_I64_I64_I64_DPP : VOP3_Profile; +def VOP_I32_I32_I64_DPP : VOP3_Profile> { + let HasClamp = 1; +} } // End HasExt64BitDPP = 1; //===----------------------------------------------------------------------===// @@ -152,6 +160,15 @@ defm V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile, any_fma>, VOPD_Component<0x13, "v_fma_f32">; defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile, int_amdgcn_lerp>; +let SchedRW = [WriteIntMul] in { + let SubtargetPredicate = HasMadU32Inst in + defm V_MAD_U32 : VOP3Inst <"v_mad_u32", V_MAD_U32_PROF>; + let SubtargetPredicate = isGFX1250Plus in { + defm V_MAD_NC_U64_U32 : VOP3Inst<"v_mad_nc_u64_u32", VOP_I32_I32_I64_DPP>; + defm V_MAD_NC_I64_I32 : VOP3Inst<"v_mad_nc_i64_i32", VOP_I32_I32_I64_DPP>; + } +} + let SchedRW = [WriteDoubleAdd] in { let FPDPRounding = 1 in { defm V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP_F64_F64_F64_F64_DPP_PROF, any_fma>, VOPD_Component<0x20, "v_fma_f64">; @@ -848,6 +865,9 @@ def : ThreeOp_i32_Pats; def : ThreeOp_i32_Pats; def : ThreeOp_i32_Pats; +let SubtargetPredicate = HasMadU32Inst, AddedComplexity = 10 in + def : ThreeOp_i32_Pats; + def : GCNPat< (DivergentBinFrag i32:$src0, IsPow2Plus1:$src1), (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>; @@ -1746,6 +1766,10 @@ defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x368, "v_m defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>; defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>; +defm V_MAD_U32 : VOP3Only_Realtriple_gfx1250<0x235>; +defm V_MAD_NC_U64_U32 : VOP3Only_Realtriple_gfx1250<0x2fa>; +defm V_MAD_NC_I64_I32 : VOP3Only_Realtriple_gfx1250<0x2fb>; + defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x369, "v_cvt_pk_fp8_f32">; defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x36a, "v_cvt_pk_bf8_f32">; defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 6cc192c570f8a..3daae98961bff 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -801,15 +801,15 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mul_lo_u32 v0, v6, v5 -; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v4, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v3, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9] -; GFX1250-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v11, v8 +; GFX1250-NEXT: v_mul_lo_u32 v0, v7, v4 +; GFX1250-NEXT: v_mad_u32 v5, v6, v5, v0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v6, v3, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_u32 v9, v2, v3, v5 +; GFX1250-NEXT: v_mov_b32_e32 v8, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[4:5], null, v6, v4, v[10:11] -; GFX1250-NEXT: v_mad_co_u64_u32 v[2:3], null, v7, v3, v[4:5] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v6, v4, v[8:9] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], v7, v3, v[4:5] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -1206,11 +1206,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v6, 0 -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], null, v9, v5, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v6, 0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v9, v5, v[0:1] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v4, 0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], null, v2, v4, v[10:11] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v2, v4, v[10:11] ; GFX1250-NEXT: v_mov_b32_e32 v12, v1 ; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1220,15 +1220,13 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13] ; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11, v8, s0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v8, v1, vcc_lo -; GFX1250-NEXT: v_mov_b32_e32 v1, v6 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v5, v[8:9] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v8, v1, vcc_lo +; GFX1250-NEXT: v_mad_u32 v1, v2, v5, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_mov_b32_e32 v2, v7 -; GFX1250-NEXT: v_mad_co_u64_u32 v[4:5], null, v3, v4, v[8:9] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: v_mad_u32 v3, v3, v4, v1 +; GFX1250-NEXT: v_mov_b32_e32 v1, v6 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = mul i128 %num, %den ret i128 %result @@ -2856,90 +2854,89 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v0, v14, 0 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], null, v0, v12, 0 -; GFX1250-NEXT: v_mul_lo_u32 v26, v6, v9 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v14, 0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v0, v12, 0 +; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10 ; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v1, v13, v[16:17] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v1, v13, v[16:17] ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[18:19] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v2, v12, v[16:17] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v2, v12, v[16:17] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] ; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], null, v0, v10, 0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v0, v10, 0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v3, v11, v[16:17] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v3, v11, v[16:17] ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo -; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v4, v10, v[16:17] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v4, v10, v[16:17] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] -; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v5, v9, v[16:17] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[16:17] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v5, v9, v[16:17] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[16:17] ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[20:21] -; GFX1250-NEXT: v_mov_b32_e32 v20, v19 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v24, vcc_lo -; GFX1250-NEXT: v_cndmask_b32_e64 v19, 0, 1, s0 -; GFX1250-NEXT: v_mov_b32_e32 v21, v22 -; GFX1250-NEXT: v_mul_lo_u32 v22, v5, v10 -; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v2, v8, v[16:17] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v19, vcc_lo -; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v0, v13, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v20, v25 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22 +; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9 +; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[16:17] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v0, v13, v[20:21] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v12, v[20:21] +; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18 ; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11 -; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], vcc_lo, v1, v12, v[16:17] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v0, v11, v[20:21] ; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[16:17] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v8, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v1, v10, v[18:19] -; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v0, v8, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21] ; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19] ; GFX1250-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_mov_b32 v19, v24 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2 ; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v0, v9, v[18:19] ; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v15 ; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13] ; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 ; GFX1250-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2 ; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2 ; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[18:19] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v27, v13, s2 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v2, v10, s2 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v6, v11, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26, v11, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v23, v0, s2 +; GFX1250-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v1, v14 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v9, s5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v20, s4 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v29, s3 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v25, s1 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo +; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v27, s0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v26, s0 -; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v8, v[0:1] -; GFX1250-NEXT: v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v14 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1250-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v7, v8 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo +; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, v16 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = mul i256 %num, %den ret i256 %result @@ -3004,7 +3001,7 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_load_b32 v2, v[2:3], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v2, 0 ; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX1250-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 @@ -3195,7 +3192,7 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_load_b32 v2, v[2:3], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0 +; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v2, 0 ; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX1250-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll index eb28e6f513c62..05a0b1a8fdfec 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll @@ -1,9 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -< %s | FileCheck --check-prefixes=GCN,GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -< %s | FileCheck --check-prefixes=GCN,GFX10 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -amdgpu-enable-delay-alu=0 -< %s | FileCheck --check-prefixes=GCN,GFX11 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -< %s | FileCheck --check-prefixes=GCN,GFX10 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 -< %s | FileCheck --check-prefixes=GCN,GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck --check-prefixes=GFX10 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GFX10 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_ps float @mad_i32_vvv(i32 %a, i32 %b, i32 %c) { ; GFX9-LABEL: mad_i32_vvv: @@ -22,6 +23,11 @@ define amdgpu_ps float @mad_i32_vvv(i32 %a, i32 %b, i32 %c) { ; GFX11-NEXT: v_mov_b32_e32 v4, v0 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v3, v[2:3] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: mad_i32_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mad_u32 v0, v0, v1, v2 +; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b %add = add i32 %mul, %c %cast = bitcast i32 %add to float @@ -35,6 +41,34 @@ define amdgpu_ps float @mad_i32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) { ; GCN-NEXT: s_add_i32 s0, s0, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: mad_i32_sss: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: s_add_i32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: mad_i32_sss: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mul_i32 s0, s0, s1 +; GFX10-NEXT: s_add_i32 s0, s0, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: mad_i32_sss: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mul_i32 s0, s0, s1 +; GFX11-NEXT: s_add_i32 s0, s0, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: mad_i32_sss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mul_i32 s0, s0, s1 +; GFX1250-NEXT: s_add_co_i32 s0, s0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b %add = add i32 %mul, %c %cast = bitcast i32 %add to float @@ -58,6 +92,11 @@ define amdgpu_ps float @mad_i32_vvc(i32 %a, i32 %b) { ; GFX11-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, 42 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: mad_i32_vvc: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mad_u32 v0, v0, v1, 42 +; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b %add = add i32 %mul, 42 %cast = bitcast i32 %add to float @@ -83,6 +122,11 @@ define amdgpu_ps float @mad_i32_vvi(i32 %a, i32 %b) { ; GFX11-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, 0x12d687 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: mad_i32_vvi: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mad_u32 v0, v0, v1, 0x12d687 +; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b %add = add i32 %mul, 1234567 %cast = bitcast i32 %add to float @@ -108,6 +152,11 @@ define amdgpu_ps float @mad_i32_vvi_neg(i32 %a, i32 %b) { ; GFX11-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, 0xffffffffffed2979 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: mad_i32_vvi_neg: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mad_u32 v0, v0, v1, 0xffed2979 +; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b %add = add i32 %mul, -1234567 %cast = bitcast i32 %add to float @@ -130,6 +179,11 @@ define amdgpu_ps float @mad_i32_vcv(i32 %a, i32 %c) { ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, 42, v[1:2] ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: mad_i32_vcv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mad_u32 v0, v0, 42, v1 +; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, 42 %add = add i32 %mul, %c %cast = bitcast i32 %add to float @@ -152,6 +206,11 @@ define amdgpu_ps float @mad_i32_vcc(i32 %a) { ; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, 42, 43 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: mad_i32_vcc: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mad_u32 v0, v0, 42, 43 +; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, 42 %add = add i32 %mul, 43 %cast = bitcast i32 %add to float @@ -175,6 +234,11 @@ define amdgpu_ps float @mad_i32_vvs(i32 %a, i32 %b, i32 inreg %c) { ; GFX11-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, s[0:1] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: mad_i32_vvs: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mad_u32 v0, v0, v1, s0 +; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b %add = add i32 %mul, %c %cast = bitcast i32 %add to float @@ -197,6 +261,11 @@ define amdgpu_ps float @mad_i32_vsv(i32 %a, i32 inreg %b, i32 %c) { ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, s0, v[1:2] ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: mad_i32_vsv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mad_u32 v0, v0, s0, v1 +; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b %add = add i32 %mul, %c %cast = bitcast i32 %add to float @@ -219,6 +288,11 @@ define amdgpu_ps float @mad_i32_svv(i32 inreg %a, i32 %b, i32 %c) { ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, s0, v0, v[1:2] ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: mad_i32_svv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mad_u32 v0, s0, v0, v1 +; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b %add = add i32 %mul, %c %cast = bitcast i32 %add to float @@ -244,6 +318,11 @@ define amdgpu_ps float @mad_i32_vss(i32 %a, i32 inreg %b, i32 inreg %c) { ; GFX11-NEXT: s_mov_b32 s2, s1 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, s0, s[2:3] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: mad_i32_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mad_u32 v0, v0, s0, s1 +; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b %add = add i32 %mul, %c %cast = bitcast i32 %add to float @@ -269,6 +348,11 @@ define amdgpu_ps float @mad_i32_svs(i32 inreg %a, i32 %b, i32 inreg %c) { ; GFX11-NEXT: s_mov_b32 s2, s1 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: mad_i32_svs: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mad_u32 v0, s0, v0, s1 +; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b %add = add i32 %mul, %c %cast = bitcast i32 %add to float @@ -292,6 +376,11 @@ define amdgpu_ps float @mad_i32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) { ; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, s0, s1, v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: mad_i32_ssv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mad_u32 v0, s0, s1, v0 +; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b %add = add i32 %mul, %c %cast = bitcast i32 %add to float @@ -322,6 +411,14 @@ define amdgpu_ps float @mad_i32_vvv_multiuse(i32 %a, i32 %b, i32 %c) { ; GFX11-NEXT: flat_store_b32 v[0:1], v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: mad_i32_vvv_multiuse: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mul_lo_u32 v1, v0, v1 +; GFX1250-NEXT: v_add_nc_u32_e32 v0, v1, v2 +; GFX1250-NEXT: flat_store_b32 v[0:1], v1 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b %add = add i32 %mul, %c store i32 %mul, ptr poison diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 8d3716ef62f7c..7e3d5c97391e1 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -3548,28 +3548,27 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; GFX1250-NEXT: s_wait_loadcnt 0x1 ; GFX1250-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v10, v0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_dual_mov_b32 v9, v11 :: v_dual_mov_b32 v8, v4 ; GFX1250-NEXT: v_mul_u64_e32 v[6:7], v[0:1], v[6:7] -; GFX1250-NEXT: v_mul_lo_u32 v3, v3, v4 ; GFX1250-NEXT: v_mul_u64_e32 v[8:9], v[8:9], v[10:11] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], null, v2, v4, v[6:7] -; GFX1250-NEXT: v_mul_lo_u32 v2, v2, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[6:7], v2, v4, v[6:7] ; GFX1250-NEXT: v_mov_b32_e32 v10, v9 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], null, v5, v0, v[10:11] -; GFX1250-NEXT: v_add3_u32 v7, v3, v7, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[12:13], v5, v0, v[10:11] +; GFX1250-NEXT: v_mad_u32 v0, v3, v4, v7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_dual_mov_b32 v10, v13 :: v_dual_mov_b32 v13, v11 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], null, v4, v1, v[12:13] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_u32 v7, v2, v5, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[12:13], v4, v1, v[12:13] ; GFX1250-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v9, v12 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_mov_b32_e32 v14, v13 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[14:15] -; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v1, v[10:11] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v5, v1, v[10:11] ; GFX1250-NEXT: v_add_nc_u64_e32 v[10:11], v[0:1], v[6:7] ; GFX1250-NEXT: global_store_b128 v16, v[8:11], s[2:3] scale_offset ; GFX1250-NEXT: s_endpgm diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s index 789d6f892762b..b67c6d570d217 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s @@ -16,6 +16,57 @@ v_lshl_add_u64 v[2:3], s[4:5], 4, v[2:3] v_lshl_add_u64 v[2:3], v[4:5], v7, 12345 // GFX1250: v_lshl_add_u64 v[2:3], v[4:5], v7, 0x3039 ; encoding: [0x02,0x00,0x52,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] +v_mad_u32 v2, s4, v7, v8 +// GFX1250: v_mad_u32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x0e,0x22,0x04] + +v_mad_u32 v2, v4, 0, 1 +// GFX1250: v_mad_u32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x01,0x05,0x02] + +v_mad_u32 v2, v4, 3, s2 +// GFX1250: v_mad_u32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x07,0x09,0x00] + +v_mad_u32 v2, s4, 4, v2 +// GFX1250: v_mad_u32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x08,0x09,0x04] + +v_mad_u32 v2, v4, v7, 12345 +// GFX1250: v_mad_u32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +v_mad_nc_u64_u32 v[2:3], s4, v7, v[8:9] +// GFX1250: v_mad_nc_u64_u32 v[2:3], s4, v7, v[8:9] ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x0e,0x22,0x04] + +v_mad_nc_u64_u32 v[2:3], v4, 0, 1 +// GFX1250: v_mad_nc_u64_u32 v[2:3], v4, 0, 1 ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x01,0x05,0x02] + +v_mad_nc_u64_u32 v[2:3], v4, 3, s[2:3] +// GFX1250: v_mad_nc_u64_u32 v[2:3], v4, 3, s[2:3] ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x07,0x09,0x00] + +v_mad_nc_u64_u32 v[2:3], s4, 4, v[2:3] +// GFX1250: v_mad_nc_u64_u32 v[2:3], s4, 4, v[2:3] ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x08,0x09,0x04] + +v_mad_nc_u64_u32 v[2:3], v4, v7, 12345 +// GFX1250: v_mad_nc_u64_u32 v[2:3], v4, v7, 0x3039 ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +v_mad_nc_u64_u32 v[2:3], s4, v7, v[8:9] clamp +// GFX1250: v_mad_nc_u64_u32 v[2:3], s4, v7, v[8:9] clamp ; encoding: [0x02,0x80,0xfa,0xd6,0x04,0x0e,0x22,0x04] + +v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] +// GFX1250: v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x0e,0x22,0x04] + +v_mad_nc_i64_i32 v[2:3], v4, 0, 1 +// GFX1250: v_mad_nc_i64_i32 v[2:3], v4, 0, 1 ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x01,0x05,0x02] + +v_mad_nc_i64_i32 v[2:3], v4, 3, s[2:3] +// GFX1250: v_mad_nc_i64_i32 v[2:3], v4, 3, s[2:3] ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x07,0x09,0x00] + +v_mad_nc_i64_i32 v[2:3], s4, 4, v[2:3] +// GFX1250: v_mad_nc_i64_i32 v[2:3], s4, 4, v[2:3] ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x08,0x09,0x04] + +v_mad_nc_i64_i32 v[2:3], v4, v7, 12345 +// GFX1250: v_mad_nc_i64_i32 v[2:3], v4, v7, 0x3039 ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] clamp +// GFX1250: v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] clamp ; encoding: [0x02,0x80,0xfb,0xd6,0x04,0x0e,0x22,0x04] + v_cvt_pk_bf16_f32 v5, v1, v2 // GFX1250: v_cvt_pk_bf16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x6d,0xd7,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s index e1165faf59d9c..5157020fcc675 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s @@ -16,6 +16,57 @@ v_lshl_add_u64 v[2:3], s[4:5], 4, v[2:3] v_lshl_add_u64 v[2:3], v[4:5], v7, 12345 // GFX1250: v_lshl_add_u64 v[2:3], v[4:5], v7, 0x3039 ; encoding: [0x02,0x00,0x52,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] +v_mad_u32 v2, s4, v7, v8 +// GFX1250: v_mad_u32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x0e,0x22,0x04] + +v_mad_u32 v2, v4, 0, 1 +// GFX1250: v_mad_u32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x01,0x05,0x02] + +v_mad_u32 v2, v4, 3, s2 +// GFX1250: v_mad_u32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x07,0x09,0x00] + +v_mad_u32 v2, s4, 4, v2 +// GFX1250: v_mad_u32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x08,0x09,0x04] + +v_mad_u32 v2, v4, v7, 12345 +// GFX1250: v_mad_u32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +v_mad_nc_u64_u32 v[2:3], s4, v7, v[8:9] +// GFX1250: v_mad_nc_u64_u32 v[2:3], s4, v7, v[8:9] ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x0e,0x22,0x04] + +v_mad_nc_u64_u32 v[2:3], v4, 0, 1 +// GFX1250: v_mad_nc_u64_u32 v[2:3], v4, 0, 1 ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x01,0x05,0x02] + +v_mad_nc_u64_u32 v[2:3], v4, 3, s[2:3] +// GFX1250: v_mad_nc_u64_u32 v[2:3], v4, 3, s[2:3] ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x07,0x09,0x00] + +v_mad_nc_u64_u32 v[2:3], s4, 4, v[2:3] +// GFX1250: v_mad_nc_u64_u32 v[2:3], s4, 4, v[2:3] ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x08,0x09,0x04] + +v_mad_nc_u64_u32 v[2:3], v4, v7, 12345 +// GFX1250: v_mad_nc_u64_u32 v[2:3], v4, v7, 0x3039 ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +v_mad_nc_u64_u32 v[2:3], s4, v7, v[8:9] clamp +// GFX1250: v_mad_nc_u64_u32 v[2:3], s4, v7, v[8:9] clamp ; encoding: [0x02,0x80,0xfa,0xd6,0x04,0x0e,0x22,0x04] + +v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] +// GFX1250: v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x0e,0x22,0x04] + +v_mad_nc_i64_i32 v[2:3], v4, 0, 1 +// GFX1250: v_mad_nc_i64_i32 v[2:3], v4, 0, 1 ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x01,0x05,0x02] + +v_mad_nc_i64_i32 v[2:3], v4, 3, s[2:3] +// GFX1250: v_mad_nc_i64_i32 v[2:3], v4, 3, s[2:3] ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x07,0x09,0x00] + +v_mad_nc_i64_i32 v[2:3], s4, 4, v[2:3] +// GFX1250: v_mad_nc_i64_i32 v[2:3], s4, 4, v[2:3] ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x08,0x09,0x04] + +v_mad_nc_i64_i32 v[2:3], v4, v7, 12345 +// GFX1250: v_mad_nc_i64_i32 v[2:3], v4, v7, 0x3039 ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] clamp +// GFX1250: v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] clamp ; encoding: [0x02,0x80,0xfb,0xd6,0x04,0x0e,0x22,0x04] + v_cvt_pk_bf16_f32 v5, v1, v2 // GFX1250: v_cvt_pk_bf16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x6d,0xd7,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s index e2fafe415ff7f..b18249d7ee0a8 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s @@ -5,7 +5,40 @@ v_lshl_add_u64 v[2:3], v[4:5], v7, v[8:9] dpp8:[7,6,5,4,3,2,1,0] // GFX125X-ERR-NEXT:{{^}}v_lshl_add_u64 v[2:3], v[4:5], v7, v[8:9] dpp8:[7,6,5,4,3,2,1,0] // GFX125X-ERR-NEXT:{{^}} ^ +v_mad_u32 v2, v4, v7, v8 dpp8:[7,6,5,4,3,2,1,0] +// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX125X-ERR-NEXT:{{^}}v_mad_u32 v2, v4, v7, v8 dpp8:[7,6,5,4,3,2,1,0] +// GFX125X-ERR-NEXT:{{^}} ^ + +v_mad_nc_u64_u32 v[4:5], v2, v5, v[6:7] dpp8:[7,6,5,4,3,2,1,0] +// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX125X-ERR-NEXT:{{^}}v_mad_nc_u64_u32 v[4:5], v2, v5, v[6:7] dpp8:[7,6,5,4,3,2,1,0] +// GFX125X-ERR-NEXT:{{^}} ^ + +v_mad_nc_i64_i32 v[4:5], v2, v5, v[6:7] dpp8:[7,6,5,4,3,2,1,0] +// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX125X-ERR-NEXT:{{^}}v_mad_nc_i64_i32 v[4:5], v2, v5, v[6:7] dpp8:[7,6,5,4,3,2,1,0] +// GFX125X-ERR-NEXT:{{^}} ^ + v_lshl_add_u64 v[2:3], v[4:5], v7, v[8:9] quad_perm:[3,2,1,0] // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. // GFX125X-ERR-NEXT:{{^}}v_lshl_add_u64 v[2:3], v[4:5], v7, v[8:9] quad_perm:[3,2,1,0] // GFX125X-ERR-NEXT:{{^}} ^ + +v_mad_u32 v2, v4, v7, v8 quad_perm:[3,2,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1251-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: DP ALU dpp only supports row_share +// GFX125X-ERR-NEXT:{{^}}v_mad_u32 v2, v4, v7, v8 quad_perm:[3,2,1,0] +// GFX125X-ERR-NEXT:{{^}} ^ + +v_mad_nc_u64_u32 v[4:5], v2, v5, v[6:7] quad_perm:[3,2,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1251-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: DP ALU dpp only supports row_share +// GFX125X-ERR-NEXT:{{^}}v_mad_nc_u64_u32 v[4:5], v2, v5, v[6:7] quad_perm:[3,2,1,0] +// GFX125X-ERR-NEXT:{{^}} ^ + +v_mad_nc_i64_i32 v[4:5], v2, v5, v[6:7] quad_perm:[3,2,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1251-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: DP ALU dpp only supports row_share +// GFX125X-ERR-NEXT:{{^}}v_mad_nc_i64_i32 v[4:5], v2, v5, v[6:7] quad_perm:[3,2,1,0] +// GFX125X-ERR-NEXT:{{^}} ^ diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt index a1a1d0c5d7ed2..c81d89df8c903 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt @@ -17,6 +17,57 @@ 0x02,0x00,0x52,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00 # GFX1250: v_lshl_add_u64 v[2:3], v[4:5], v7, 0x3039 ; encoding: [0x02,0x00,0x52,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] +0x02,0x00,0x35,0xd6,0x04,0x08,0x09,0x04 +# GFX1250: v_mad_u32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x08,0x09,0x04] + +0x02,0x00,0x35,0xd6,0x04,0x0e,0x22,0x04 +# GFX1250: v_mad_u32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x0e,0x22,0x04] + +0x02,0x00,0x35,0xd6,0x04,0x01,0x05,0x02 +# GFX1250: v_mad_u32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x01,0x05,0x02] + +0x02,0x00,0x35,0xd6,0x04,0x07,0x09,0x00 +# GFX1250: v_mad_u32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x07,0x09,0x00] + +0x02,0x00,0x35,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00 +# GFX1250: v_mad_u32 v2, v4, v7, 0x3039 ; encoding: [0x02,0x00,0x35,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +0x02,0x00,0xfa,0xd6,0x04,0x08,0x09,0x04 +# GFX1250: v_mad_nc_u64_u32 v[2:3], s4, 4, v[2:3] ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x08,0x09,0x04] + +0x02,0x00,0xfa,0xd6,0x04,0x0e,0x22,0x04 +# GFX1250: v_mad_nc_u64_u32 v[2:3], s4, v7, v[8:9] ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x0e,0x22,0x04] + +0x02,0x00,0xfa,0xd6,0x04,0x01,0x05,0x02 +# GFX1250: v_mad_nc_u64_u32 v[2:3], v4, 0, 1 ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x01,0x05,0x02] + +0x02,0x00,0xfa,0xd6,0x04,0x07,0x09,0x00 +# GFX1250: v_mad_nc_u64_u32 v[2:3], v4, 3, s[2:3] ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x07,0x09,0x00] + +0x02,0x00,0xfa,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00 +# GFX1250: v_mad_nc_u64_u32 v[2:3], v4, v7, 0x3039 ; encoding: [0x02,0x00,0xfa,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +0x02,0x80,0xfa,0xd6,0x04,0x0e,0x22,0x04 +# GFX1250: v_mad_nc_u64_u32 v[2:3], s4, v7, v[8:9] clamp ; encoding: [0x02,0x80,0xfa,0xd6,0x04,0x0e,0x22,0x04] + +0x02,0x00,0xfb,0xd6,0x04,0x08,0x09,0x04 +# GFX1250: v_mad_nc_i64_i32 v[2:3], s4, 4, v[2:3] ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x08,0x09,0x04] + +0x02,0x00,0xfb,0xd6,0x04,0x0e,0x22,0x04 +# GFX1250: v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x0e,0x22,0x04] + +0x02,0x00,0xfb,0xd6,0x04,0x01,0x05,0x02 +# GFX1250: v_mad_nc_i64_i32 v[2:3], v4, 0, 1 ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x01,0x05,0x02] + +0x02,0x00,0xfb,0xd6,0x04,0x07,0x09,0x00 +# GFX1250: v_mad_nc_i64_i32 v[2:3], v4, 3, s[2:3] ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x07,0x09,0x00] + +0x02,0x00,0xfb,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00 +# GFX1250: v_mad_nc_i64_i32 v[2:3], v4, v7, 0x3039 ; encoding: [0x02,0x00,0xfb,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00] + +0x02,0x80,0xfb,0xd6,0x04,0x0e,0x22,0x04 +# GFX1250: v_mad_nc_i64_i32 v[2:3], s4, v7, v[8:9] clamp ; encoding: [0x02,0x80,0xfb,0xd6,0x04,0x0e,0x22,0x04] + 0xff,0x81,0x6d,0xd7,0xff,0xd6,0x00,0x38,0x56,0x34,0x12,0xaf # GFX1250: v_cvt_pk_bf16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 ; encoding: [0xff,0x81,0x6d,0xd7,0xff,0xd6,0x00,0x38,0x56,0x34,0x12,0xaf]