diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index c2fca79979e1b..2158a4d0e2076 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -4078,18 +4078,26 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, // register. Mods |= SISrcMods::OP_SEL_1; - if (IsExtractHigh || - (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) { - Mods |= SISrcMods::OP_SEL_0; + if (Src.getValueSizeInBits() == 16) { + if (isExtractHiElt(Src, Src)) { + Mods |= SISrcMods::OP_SEL_0; - // TODO: Should we try to look for neg/abs here? - } + // TODO: Should we try to look for neg/abs here? + return true; + } + + if (Src.getOpcode() == ISD::TRUNCATE && + Src.getOperand(0).getValueType() == MVT::i32) { + Src = Src.getOperand(0); + return true; + } + + if (Subtarget->useRealTrue16Insts()) + // In true16 mode, pack src to a 32bit + Src = createVOP3PSrc32FromLo16(Src, In, CurDAG, Subtarget); + } else if (IsExtractHigh) + Mods |= SISrcMods::OP_SEL_0; - // Prevent unnecessary subreg COPY to VGPR_16 - if (Src.getOpcode() == ISD::TRUNCATE && - Src.getOperand(0).getValueType() == MVT::i32) { - Src = Src.getOperand(0); - } return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 6acbf52b97de5..680e7eb3de6be 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -175,6 +175,40 @@ void AMDGPUMCInstLower::lowerT16D16Helper(const MachineInstr *MI, } } +void AMDGPUMCInstLower::lowerT16FmaMixFP16(const MachineInstr *MI, + MCInst &OutMI) const { + unsigned Opcode = MI->getOpcode(); + const auto *TII = static_cast(ST.getInstrInfo()); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + + int VDstIdx = AMDGPU::getNamedOperandIdx(Opcode, llvm::AMDGPU::OpName::vdst); + const MachineOperand &VDst = MI->getOperand(VDstIdx); + bool IsHi = AMDGPU::isHi16Reg(VDst.getReg(), TRI); + switch (Opcode) { + case AMDGPU::V_FMA_MIX_F16_t16: + Opcode = IsHi ? AMDGPU::V_FMA_MIXHI_F16 : AMDGPU::V_FMA_MIXLO_F16; + break; + case AMDGPU::V_FMA_MIX_BF16_t16: + Opcode = IsHi ? AMDGPU::V_FMA_MIXHI_BF16 : AMDGPU::V_FMA_MIXLO_BF16; + break; + } + int MCOpcode = TII->pseudoToMCOpcode(Opcode); + assert(MCOpcode != -1 && + "Pseudo instruction doesn't have a target-specific version"); + OutMI.setOpcode(MCOpcode); + + // lower operands + for (int I = 0, E = MI->getNumExplicitOperands(); I < E; I++) { + const MachineOperand &MO = MI->getOperand(I); + MCOperand MCOp; + if (I == VDstIdx) + MCOp = MCOperand::createReg(TRI.get32BitRegister(VDst.getReg())); + else + lowerOperand(MO, MCOp); + OutMI.addOperand(MCOp); + } +} + void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { unsigned Opcode = MI->getOpcode(); const auto *TII = static_cast(ST.getInstrInfo()); @@ -201,6 +235,10 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { } else if (AMDGPU::getT16D16Helper(Opcode)) { lowerT16D16Helper(MI, OutMI); return; + } else if (Opcode == AMDGPU::V_FMA_MIX_F16_t16 || + Opcode == AMDGPU::V_FMA_MIX_BF16_t16) { + lowerT16FmaMixFP16(MI, OutMI); + return; } int MCOpcode = TII->pseudoToMCOpcode(Opcode); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h index 68b8d4e25a6cc..23ed55d45220f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h @@ -38,6 +38,7 @@ class AMDGPUMCInstLower { void lower(const MachineInstr *MI, MCInst &OutMI) const; void lowerT16D16Helper(const MachineInstr *MI, MCInst &OutMI) const; + void lowerT16FmaMixFP16(const MachineInstr *MI, MCInst &OutMI) const; }; namespace { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 70223da961e92..04dfd9b0da46a 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -9498,6 +9498,13 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { DescSize = Desc.getSize(); } + // If FMA Pseudo inst, get correct MC code size + if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) { + // All potential lowerings are the same size; arbitrarily pick one. + const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16); + DescSize = Desc.getSize(); + } + return DescSize; } } diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index f7279b664ed27..52ee1e874ad86 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -64,6 +64,13 @@ class VOP3P_Mix_Profile + : VOP3P_Mix_Profile { + let IsTrue16 = 1; + let IsRealTrue16 = 1; + let DstRC64 = getVALUDstForVT.ret; +} + multiclass VOP3PInst { def NAME : VOP3P_Pseudo { } // end SubtargetPredicate = isGFX11Plus } +multiclass VOP3_VOP3PInst_t16 { + def NAME : VOP3P_Pseudo; + + if P.HasExtVOP3DPP then + def _dpp : VOP3_DPP_Pseudo { + let VOP3P = 1; + let PseudoInstr = OpName#"_dpp"; + } +} + let isReMaterializable = 1 in { let isCommutable = 1 in { defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3P_Profile>; @@ -160,12 +177,9 @@ defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile { +multiclass MadFmaMixFP32Pats { defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods); defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt); // At least one of the operands needs to be an fpextend of an f16 @@ -189,7 +203,14 @@ multiclass MadFmaMixPats; +} +multiclass MadFmaMixFP16Pats { + defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods); def : GCNPat < (AMDGPUclamp (build_vector (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)), @@ -243,9 +264,6 @@ multiclass MadFmaMixPats; +} - } // end True16Predicate +multiclass MadFmaMixFP16Pats_t16 { + defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods); + def : GCNPat < + (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))), + (mix_inst_16 $src0_modifiers, $src0, + $src1_modifiers, $src1, + (i32 0), (i32 0), + DSTCLAMP.NONE) + >; - let True16Predicate = UseRealTrue16Insts in { def : GCNPat < - (build_vector (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), VT:$elt1), - (vecVT (mixlo_inst $src0_modifiers, $src0, - $src1_modifiers, $src1, - $src2_modifiers, $src2, - DSTCLAMP.NONE, - (REG_SEQUENCE VGPR_32, (VT (IMPLICIT_DEF)), lo16, $elt1, hi16))) + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), + (mix_inst_16 $src0_modifiers, $src0, + $src1_modifiers, $src1, + $src2_modifiers, $src2, + DSTCLAMP.NONE) >; + def : GCNPat < - (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))), - (vecVT (mixhi_inst $src0_modifiers, $src0, - $src1_modifiers, $src1, - $src2_modifiers, $src2, - DSTCLAMP.NONE, - (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16))) + (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))), + (mix_inst_16 $src0_modifiers, $src0, + $src1_modifiers, $src1, + $src2_modifiers, $src2, + DSTCLAMP.ENABLE) >; def : GCNPat < - (build_vector - VT:$elt0, - (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))), - (vecVT (mixhi_inst $src0_modifiers, $src0, - $src1_modifiers, $src1, - $src2_modifiers, $src2, - DSTCLAMP.ENABLE, - (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16))) + (AMDGPUclamp (build_vector + (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$lo_src1, i32:$lo_src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$lo_src2, i32:$lo_src2_modifiers))))), + (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$hi_src0, i32:$hi_src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$hi_src1, i32:$hi_src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$hi_src2, i32:$hi_src2_modifiers))))))), + (vecVT (REG_SEQUENCE VGPR_32, (mix_inst_16 $lo_src0_modifiers, $lo_src0, + $lo_src1_modifiers, $lo_src1, + $lo_src2_modifiers, $lo_src2, + DSTCLAMP.ENABLE), lo16, + (mix_inst_16 $hi_src0_modifiers, $hi_src0, + $hi_src1_modifiers, $hi_src1, + $hi_src2_modifiers, $hi_src2, + DSTCLAMP.ENABLE), hi16)) >; - } // end True16Predicate } class MinimumMaximumByMinimum3Maximum3VOP3P; +defm : MadFmaMixFP32Pats; +defm : MadFmaMixFP16Pats; } // OtherPredicates = [NoFP32Denormals] } // End SubtargetPredicate = HasMadMixInsts @@ -360,10 +394,19 @@ defm V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3P_Mix_Profile>; } + +// Pseudo true16 inst for v_fma_mixlo/hi_f16 +defm V_FMA_MIX_F16_t16 : VOP3_VOP3PInst_t16<"v_fma_mix_f16_t16", VOP3P_Mix_Profile_t16>; } // End FPDPRounding = 1 } -defm : MadFmaMixPats; +defm : MadFmaMixFP32Pats; + +foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in +let True16Predicate = p in +defm : MadFmaMixFP16Pats; +let True16Predicate = UseRealTrue16Insts in +defm : MadFmaMixFP16Pats_t16; } let SubtargetPredicate = HasFmaMixBF16Insts in { @@ -378,10 +421,18 @@ defm V_FMA_MIXLO_BF16 : VOP3_VOP3PInst<"v_fma_mixlo_bf16", VOP3P_Mix_Profile>; } + +// Pseudo true16 inst for v_fma_mixlo/hi_bf16 +defm V_FMA_MIX_BF16_t16 : VOP3_VOP3PInst_t16<"v_fma_mix_bf16_t16", VOP3P_Mix_Profile_t16>; } // End FPDPRounding = 1 } // End isCommutable = 1 -defm : MadFmaMixPats; +defm : MadFmaMixFP32Pats; +foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in +let True16Predicate = p in +defm : MadFmaMixFP16Pats; +let True16Predicate = UseRealTrue16Insts in +defm : MadFmaMixFP16Pats_t16; } // End SubtargetPredicate = HasFmaMixBF16Insts def PK_ADD_MINMAX_Profile : VOP3P_Profile { diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll index 210e09fd9169a..7f6a920d25016 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -137,33 +137,31 @@ define amdgpu_kernel void @v_fdiv_f16( ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc +; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3] glc dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] glc dlc +; GFX11-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] glc dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v0.l -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v2.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v4, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v0 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v4, v2 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v7, v3 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v4, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v5, v0 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v4, v2 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v5, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff800000, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.h, v1.l, v0.l -; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: v_fdiv_f16: diff --git a/llvm/test/CodeGen/AMDGPU/fma-mix.gfx11plus.ll b/llvm/test/CodeGen/AMDGPU/fma-mix.gfx11plus.ll new file mode 100644 index 0000000000000..1ba13b287be46 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fma-mix.gfx11plus.ll @@ -0,0 +1,93 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -stop-after=amdgpu-isel | FileCheck %s --check-prefixes=GFX11-REAL16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -stop-after=amdgpu-isel | FileCheck %s --check-prefixes=GFX11-FAKE16 + +; Make sure no "vgpr32 = copy vgpr16" is generated + +define amdgpu_kernel void @fma_mix_f16 (ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) { + ; GFX11-REAL16-LABEL: name: fma_mix_f16 + ; GFX11-REAL16: bb.0.entry: + ; GFX11-REAL16-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 + ; GFX11-REAL16-NEXT: {{ $}} + ; GFX11-REAL16-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GFX11-REAL16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GFX11-REAL16-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s256) from %ir.a.kernarg.offset, align 4, addrspace 4) + ; GFX11-REAL16-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1 + ; GFX11-REAL16-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0 + ; GFX11-REAL16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1 + ; GFX11-REAL16-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3 + ; GFX11-REAL16-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2 + ; GFX11-REAL16-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1 + ; GFX11-REAL16-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5 + ; GFX11-REAL16-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4 + ; GFX11-REAL16-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1 + ; GFX11-REAL16-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7 + ; GFX11-REAL16-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6 + ; GFX11-REAL16-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1 + ; GFX11-REAL16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-REAL16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023 + ; GFX11-REAL16-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec + ; GFX11-REAL16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2 + ; GFX11-REAL16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec + ; GFX11-REAL16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep1, addrspace 1) + ; GFX11-REAL16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE1]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep2, addrspace 1) + ; GFX11-REAL16-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 killed [[REG_SEQUENCE2]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s16) from %ir.in.gep3, addrspace 1) + ; GFX11-REAL16-NEXT: [[V_MOV_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, 14336, 0, implicit $exec + ; GFX11-REAL16-NEXT: [[V_ADD_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_ADD_F16_t16_e64 0, killed [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], 0, killed [[V_MOV_B16_t16_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-REAL16-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX11-REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX11-REAL16-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_ADD_F16_t16_e64_]], %subreg.lo16, killed [[DEF]], %subreg.hi16 + ; GFX11-REAL16-NEXT: [[V_FMA_MIX_F16_t16_:%[0-9]+]]:vgpr_16 = nofpexcept V_FMA_MIX_F16_t16 0, killed [[GLOBAL_LOAD_DWORD_SADDR]], 0, killed [[GLOBAL_LOAD_DWORD_SADDR1]], 8, killed [[REG_SEQUENCE4]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-REAL16-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 killed [[V_MOV_B32_e32_]], killed [[V_FMA_MIX_F16_t16_]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s16) into %ir.4, addrspace 1) + ; GFX11-REAL16-NEXT: S_ENDPGM 0 + ; + ; GFX11-FAKE16-LABEL: name: fma_mix_f16 + ; GFX11-FAKE16: bb.0.entry: + ; GFX11-FAKE16-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s256) from %ir.a.kernarg.offset, align 4, addrspace 4) + ; GFX11-FAKE16-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1 + ; GFX11-FAKE16-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0 + ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1 + ; GFX11-FAKE16-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3 + ; GFX11-FAKE16-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2 + ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1 + ; GFX11-FAKE16-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5 + ; GFX11-FAKE16-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4 + ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1 + ; GFX11-FAKE16-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7 + ; GFX11-FAKE16-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6 + ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1 + ; GFX11-FAKE16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023 + ; GFX11-FAKE16-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2 + ; GFX11-FAKE16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec + ; GFX11-FAKE16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep1, addrspace 1) + ; GFX11-FAKE16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE1]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep2, addrspace 1) + ; GFX11-FAKE16-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR killed [[REG_SEQUENCE2]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s16) from %ir.in.gep3, addrspace 1) + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 14336 + ; GFX11-FAKE16-NEXT: [[V_ADD_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_fake16_e64 0, killed [[GLOBAL_LOAD_USHORT_SADDR]], 0, killed [[S_MOV_B32_2]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX11-FAKE16-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; GFX11-FAKE16-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 0, killed [[GLOBAL_LOAD_DWORD_SADDR]], 0, killed [[GLOBAL_LOAD_DWORD_SADDR1]], 8, killed [[V_ADD_F16_fake16_e64_]], 0, [[COPY10]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: GLOBAL_STORE_SHORT_SADDR killed [[V_MOV_B32_e32_]], killed [[V_FMA_MIXLO_F16_]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s16) into %ir.4, addrspace 1) + ; GFX11-FAKE16-NEXT: S_ENDPGM 0 +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %in.gep1 = getelementptr i32, ptr addrspace(1) %a, i32 %tid + %in.gep2 = getelementptr i32, ptr addrspace(1) %b, i32 %tid + %in.gep3 = getelementptr i32, ptr addrspace(1) %c, i32 %tid + %load.a = load float, ptr addrspace(1) %in.gep1 + %load.b = load float, ptr addrspace(1) %in.gep2 + %load.c = load half, ptr addrspace(1) %in.gep3 + %add.c = fadd half %load.c, 0.5 + %load.float.c = fpext half %add.c to float + %result = tail call float @llvm.fmuladd.f32(float %load.a, float %load.b, float %load.float.c) + %half = fptrunc float %result to half + store half %half, ptr addrspace(1) %out + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll index a859cc91b7fde..fe95d4561d0cd 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll @@ -1571,25 +1571,24 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind { ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0x46000000 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v1, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l +; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, 0x46000000, v1 +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, 0x46000000, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v4, -v3, v2, s0 op_sel_hi:[1,0,0] -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v2, v4, v1 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0] +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v2, v3, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v3, v2, s0 op_sel_hi:[1,0,0] -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0] +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v3, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.h, v0.l, 0x7000 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v1.l, 0x7000 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: @@ -1739,25 +1738,24 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.l, 1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 2.0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v1, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v1, v1 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v4, -v3, v2, s0 op_sel_hi:[1,0,0] +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v0, v0 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v2, v4, v1 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v3, v2, s0 op_sel_hi:[1,0,0] +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v2, v3, v0 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v3, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff800000, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.h, v0.l, 2.0 +; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v1.l, 2.0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index c4a38dcd7b5f3..78a961ea0da17 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -1433,37 +1433,35 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] -; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3] +; GFX11-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h +; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.h, v1.l, v0.l -; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fast_frem_f16: @@ -1507,38 +1505,36 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-TRUE16-NEXT: s_clause 0x1 ; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-TRUE16-NEXT: s_clause 0x1 -; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] -; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX1150-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3] +; GFX1150-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8 ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] ; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 -; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h -; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l -; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l +; GFX1150-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1] ; GFX1150-TRUE16-NEXT: s_endpgm ; ; GFX1150-FAKE16-LABEL: fast_frem_f16: @@ -1583,38 +1579,36 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1200-TRUE16-NEXT: s_clause 0x1 ; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1200-TRUE16-NEXT: s_clause 0x1 -; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] -; GFX1200-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX1200-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3] +; GFX1200-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8 ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] ; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 -; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h -; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l -; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l +; GFX1200-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1] ; GFX1200-TRUE16-NEXT: s_endpgm ; ; GFX1200-FAKE16-LABEL: fast_frem_f16: @@ -1840,37 +1834,35 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] -; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3] +; GFX11-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h +; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.h, v1.l, v0.l -; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: unsafe_frem_f16: @@ -1914,38 +1906,36 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-TRUE16-NEXT: s_clause 0x1 ; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-TRUE16-NEXT: s_clause 0x1 -; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] -; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX1150-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3] +; GFX1150-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8 ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] ; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 -; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h -; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l -; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l +; GFX1150-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1] ; GFX1150-TRUE16-NEXT: s_endpgm ; ; GFX1150-FAKE16-LABEL: unsafe_frem_f16: @@ -1990,38 +1980,36 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX1200-TRUE16-NEXT: s_clause 0x1 ; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1200-TRUE16-NEXT: s_clause 0x1 -; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] -; GFX1200-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX1200-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3] +; GFX1200-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8 ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] ; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 -; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h -; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l -; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l +; GFX1200-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1] ; GFX1200-TRUE16-NEXT: s_endpgm ; ; GFX1200-FAKE16-LABEL: unsafe_frem_f16: diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll index 1ae3434db6da5..3f66c23e1a73b 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -65,10 +65,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s ; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: ; SDAG-GFX11-TRUE16: ; %bb.0: ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x3c00 -; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, 1.0, v0.l ; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: @@ -137,13 +136,20 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s } define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src1, half %src2, half %lo) #0 { -; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: ; GFX9: ; %bb.0: @@ -172,6 +178,14 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src ; SDAG-CI-NEXT: v_mov_b32_e32 v0, v3 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -196,10 +210,8 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, ha ; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: ; SDAG-GFX11-TRUE16: ; %bb.0: ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 -; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l ; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: @@ -277,10 +289,8 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src ; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: ; SDAG-GFX11-TRUE16: ; %bb.0: ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 -; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l ; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: @@ -499,14 +509,25 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half } define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use(half %src0, half %src1, half %src2) #0 { -; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v0, v3, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v0, v3, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v0, off dlc +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-FAKE16-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v3, off dlc +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: ; GFX9: ; %bb.0: @@ -542,6 +563,15 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GISEL-GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc +; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index eab92668c536b..21e6faf46f58d 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -412,11 +412,9 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v3.l ; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32: @@ -535,12 +533,10 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v2, v4 op_sel_hi:[1,1,1] ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v6 +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v6.l ; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32: @@ -704,16 +700,13 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v4f32: ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v2, v4 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] ; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v6.h +; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v6.l ; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v4f32: @@ -914,14 +907,23 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half ; FIXME (DAG): Fold clamp define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { -; GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: v_mov_b32_e32 v0, v3 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt: ; GFX900: ; %bb.0: @@ -978,6 +980,15 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1040,13 +1051,13 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, 0 -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp ; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp ; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32_clamp_postcvt: @@ -1247,17 +1258,29 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s } define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { -; GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v6 +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-FAKE16-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt: ; GFX900: ; %bb.0: @@ -1358,6 +1381,18 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1452,10 +1487,10 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v3.l ; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: @@ -1618,9 +1653,9 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v3.l, v0.l ; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: @@ -2385,10 +2420,8 @@ define i32 @mixlo_zext(float %src0, float %src1, float %src2) #0 { ; SDAG-GFX1100-TRUE16-LABEL: mixlo_zext: ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2 +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 ; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l ; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX1100-FAKE16-LABEL: mixlo_zext: diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll index a4878539b1c74..95df131e21358 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll @@ -2253,9 +2253,10 @@ define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half % ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.h +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; SDAG-GFX1100-TRUE16-NEXT: v_xor_b16 v2.l, 0x8000, v0.h ; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, |v2|, v1, v0 op_sel_hi:[1,1,1] ; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: diff --git a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll index 79910af5c0434..93f4ea37117ba 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll @@ -929,9 +929,8 @@ define i32 @zext_fptrunc_fma_f16(float %x, float %y, float %z) { ; GFX11-TRUE16-LABEL: zext_fptrunc_fma_f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2 +; GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: zext_fptrunc_fma_f16: