diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 2c63d2e77e5e5..8f05eb10920c7 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -343,6 +343,7 @@ def int_amdgcn_rsq_legacy : ClangBuiltin<"__builtin_amdgcn_rsq_legacy">, def int_amdgcn_rsq_clamp : Intrinsic< [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; +// For int_amdgcn_ldexp_f16, only the low 16 bits of the i32 src1 operand will be used. def int_amdgcn_ldexp : Intrinsic< [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn] diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index a04eb98874b3c..c3d66f4582635 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -832,9 +832,20 @@ def : divergent_i64_BinOp ; // 16-Bit Operand Instructions //===----------------------------------------------------------------------===// +def LDEXP_F16_VOPProfile_True16 : VOPProfile_True16 { + // The ldexp.f16 intrinsic expects an i32 src1 operand, though the hardware + // encoding treats src1 as an f16 + let Src1RC32 = RegisterOperand; + let Src1DPP = VGPR_32_Lo128; + let Src1ModDPP = IntT16VRegInputMods; +} + let isReMaterializable = 1 in { let FPDPRounding = 1 in { -defm V_LDEXP_F16 : VOP2Inst_t16 <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>; + let SubtargetPredicate = NotHasTrue16BitInsts, OtherPredicates = [Has16BitInsts] in + defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>; + let SubtargetPredicate = HasTrue16BitInsts in + defm V_LDEXP_F16_t16 : VOP2Inst <"v_ldexp_f16_t16", LDEXP_F16_VOPProfile_True16, AMDGPUldexp>; } // End FPDPRounding = 1 // FIXME VOP3 Only instructions. 
NFC using VOPProfile_True16 for these until a planned change to use a new register class for VOP3 encoded True16 instructions defm V_LSHLREV_B16 : VOP2Inst_e64_t16 <"v_lshlrev_b16", VOP_I16_I16_I16, clshl_rev_16>; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll index 7068f45590551..0d172792cf205 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll @@ -1,13 +1,79 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s declare half @llvm.amdgcn.ldexp.f16(half %a, i32 %b) -; GCN-LABEL: {{^}}ldexp_f16 -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_I32:[0-9]+]] -; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_I32]] -; GCN: buffer_store_short v[[R_F16]] define amdgpu_kernel void @ldexp_f16( +; VI-LABEL: ldexp_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort 
v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: ldexp_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s14, s2 +; GFX10-NEXT: s_mov_b32 s15, s3 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s12, s6 +; GFX10-NEXT: s_mov_b32 s13, s7 +; GFX10-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; GFX10-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: ldexp_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %a, i32 addrspace(1)* %b) 
{ @@ -18,11 +84,61 @@ define amdgpu_kernel void @ldexp_f16( ret void } -; GCN-LABEL: {{^}}ldexp_f16_imm_a -; GCN: buffer_load_dword v[[B_I32:[0-9]+]] -; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[B_I32]] -; GCN: buffer_store_short v[[R_F16]] define amdgpu_kernel void @ldexp_f16_imm_a( +; VI-LABEL: ldexp_f16_imm_a: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ldexp_f16_e32 v0, 2.0, v0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: ldexp_f16_imm_a: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_ldexp_f16_e32 v0, 2.0, v0 +; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: ldexp_f16_imm_a: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_ldexp_f16_e32 v0, 2.0, v0 +; 
GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm half addrspace(1)* %r, i32 addrspace(1)* %b) { %b.val = load i32, i32 addrspace(1)* %b @@ -31,11 +147,61 @@ define amdgpu_kernel void @ldexp_f16_imm_a( ret void } -; GCN-LABEL: {{^}}ldexp_f16_imm_b -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; VI: v_ldexp_f16_e64 v[[R_F16:[0-9]+]], v[[A_F16]], 2{{$}} -; GCN: buffer_store_short v[[R_F16]] define amdgpu_kernel void @ldexp_f16_imm_b( +; VI-LABEL: ldexp_f16_imm_b: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ldexp_f16_e64 v0, v0, 2 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: ldexp_f16_imm_b: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_ldexp_f16_e64 v0, v0, 2 +; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: ldexp_f16_imm_b: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; 
GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_ldexp_f16_e64 v0, v0, 2 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %a) { %a.val = load half, half addrspace(1)* %a diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s index c2e6974c688ef..66cb4ca17faa8 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s @@ -73,6 +73,9 @@ v_fmac_f16_e32 v5, v1, v255 v_fmamk_f16_e32 v5, v1, 0xfe0b, v255 // GFX11: error: operands are not valid for this GPU or mode +v_ldexp_f16_e32 v5, v1, v255 +// GFX11: error: operands are not valid for this GPU or mode + v_max_f16_e32 v5, v1, v255 // GFX11: error: operands are not valid for this GPU or mode @@ -142,6 +145,9 @@ v_add_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] v_fmac_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] // GFX11: error: operands are not valid for this GPU or mode +v_ldexp_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] +// GFX11: error: operands are not valid for this GPU or mode + v_max_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] // GFX11: error: operands are not valid for this GPU or mode @@ -211,6 +217,9 @@ v_add_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_fmac_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: error: operands are not valid for this GPU or mode +v_ldexp_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: error: operands are not valid for this GPU or mode + v_max_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s index c5f81d51daa6c..876b7b88f50b1 100644 --- 
a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s @@ -55,6 +55,9 @@ v_add_f16 v5, v1, v255 v_fmac_f16 v5, v1, v255 // GFX11: v_fmac_f16_e64 +v_ldexp_f16 v5, v1, v255 +// GFX11: v_ldexp_f16_e64 + v_max_f16 v5, v1, v255 // GFX11: v_max_f16_e64 @@ -115,6 +118,9 @@ v_subrev_f16 v5, v255, v2 quad_perm:[3,2,1,0] v_add_f16 v5, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_add_f16_e64 +v_ldexp_f16 v5, v1, v255 quad_perm:[3,2,1,0] +// GFX11: v_ldexp_f16_e64 + v_max_f16 v5, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_max_f16_e64 @@ -175,6 +181,9 @@ v_subrev_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] v_add_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_add_f16_e64 +v_ldexp_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_ldexp_f16_e64 + v_max_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_max_f16_e64