diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index ff2595ef51869..2a36f3dea34ce 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -167,6 +167,12 @@ def FeatureMinimum3Maximum3F16 : SubtargetFeature<"minimum3-maximum3-f16", "Has v_minimum3_f16 and v_maximum3_f16 instructions" >; +def FeatureMin3Max3PKF16 : SubtargetFeature<"min3-max3-pkf16", + "HasMin3Max3PKF16", + "true", + "Has v_pk_min3_num_f16 and v_pk_max3_num_f16 instructions" +>; + def FeatureMinimum3Maximum3PKF16 : SubtargetFeature<"minimum3-maximum3-pkf16", "HasMinimum3Maximum3PKF16", "true", @@ -2001,6 +2007,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureBF16ConversionInsts, FeatureBF16PackedInsts, FeatureCvtPkF16F32Inst, + FeatureMin3Max3PKF16, FeatureMinimum3Maximum3PKF16, FeaturePrngInst, FeaturePermlane16Swap, @@ -2361,6 +2368,10 @@ def HasMinimum3Maximum3F16 : Predicate<"Subtarget->hasMinimum3Maximum3F16()">, AssemblerPredicate<(all_of FeatureMinimum3Maximum3F16)>; +def HasMin3Max3PKF16 : + Predicate<"Subtarget->hasMin3Max3PKF16()">, + AssemblerPredicate<(all_of FeatureMin3Max3PKF16)>; + def HasMinimum3Maximum3PKF16 : Predicate<"Subtarget->hasMinimum3Maximum3PKF16()">, AssemblerPredicate<(all_of FeatureMinimum3Maximum3PKF16)>; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 5eddde1f72ec7..b22d421b425be 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -265,6 +265,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasIEEEMinimumMaximumInsts = false; bool HasMinimum3Maximum3F32 = false; bool HasMinimum3Maximum3F16 = false; + bool HasMin3Max3PKF16 = false; bool HasMinimum3Maximum3PKF16 = false; bool HasLshlAddU64Inst = false; bool HasAddSubU64Insts = false; @@ -1388,6 +1389,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return HasMinimum3Maximum3F16; } + bool hasMin3Max3PKF16() const { return HasMin3Max3PKF16; } + bool hasTanhInsts() const { return HasTanhInsts; } bool hasAddPC64Inst() const { return GFX1250Insts; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 92a56a1d5f492..f1a8ee118356e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -14068,7 +14068,8 @@ static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, case ISD::FMAXIMUMNUM: case AMDGPUISD::FMIN_LEGACY: case AMDGPUISD::FMAX_LEGACY: - return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()); + return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) || + (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16()); case ISD::FMINIMUM: case ISD::FMAXIMUM: return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) || diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 23dfa4b07a0e4..ea14c77cdff0b 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -144,10 +144,17 @@ def : VOP3PSatPat; def : VOP3PSatPat; } // End SubtargetPredicate = HasVOP3PInsts -let SubtargetPredicate = HasMinimum3Maximum3PKF16, FPDPRounding = 1 in { +let isCommutable = 1, FPDPRounding = 1 in { +let SubtargetPredicate = HasMin3Max3PKF16 in { +defm V_PK_MIN3_NUM_F16 : VOP3PInst<"v_pk_min3_num_f16", VOP3P_Profile, AMDGPUfmin3>; +defm V_PK_MAX3_NUM_F16 : VOP3PInst<"v_pk_max3_num_f16", VOP3P_Profile, AMDGPUfmax3>; +} + +let SubtargetPredicate = HasMinimum3Maximum3PKF16 in { defm V_PK_MINIMUM3_F16 : VOP3PInst<"v_pk_minimum3_f16", VOP3P_Profile, AMDGPUfminimum3>; defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile, AMDGPUfmaximum3>; } +} // End isCommutable = 1, FPDPRounding = 1 // TODO: Make sure we're doing the right thing with denormals. Note // that FMA and MAD will differ. @@ -2237,6 +2244,8 @@ defm V_PK_MIN_NUM_BF16 : VOP3P_Real_gfx1250<0x2b>; defm V_PK_MAX_NUM_BF16 : VOP3P_Real_gfx1250<0x2c>; defm V_PK_MINIMUM3_F16 : VOP3P_Real_gfx1250<0x36>; defm V_PK_MAXIMUM3_F16 : VOP3P_Real_gfx1250<0x37>; +defm V_PK_MIN3_NUM_F16 : VOP3P_Real_gfx1250<0x38>; +defm V_PK_MAX3_NUM_F16 : VOP3P_Real_gfx1250<0x39>; defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>; defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>; diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll index 16093f131a111..4827f752d9f7c 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -6,6 +6,8 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-FAKE16 %s define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmax3_olt_0_f32: @@ -157,6 +159,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: v_max3_num_f32 v0, v0, v1, v2 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: test_fmax3_olt_0_f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s10, -1 +; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s14, s10 +; GFX1250-NEXT: s_mov_b32 s15, s11 +; GFX1250-NEXT: s_mov_b32 s18, s10 +; GFX1250-NEXT: s_mov_b32 s19, s11 +; GFX1250-NEXT: s_mov_b32 s22, s10 +; GFX1250-NEXT: s_mov_b32 s23, s11 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s12, s2 +; GFX1250-NEXT: s_mov_b32 s13, s3 +; GFX1250-NEXT: s_mov_b32 s16, s4 +; GFX1250-NEXT: s_mov_b32 s17, s5 +; GFX1250-NEXT: s_mov_b32 s20, s6 +; GFX1250-NEXT: s_mov_b32 s21, s7 +; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s8, s0 +; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2 +; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX1250-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -317,6 +349,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: v_max3_num_f32 v0, v2, v0, v1 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: test_fmax3_olt_1_f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s10, -1 +; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s14, s10 +; GFX1250-NEXT: s_mov_b32 s15, s11 +; GFX1250-NEXT: s_mov_b32 s18, s10 +; GFX1250-NEXT: s_mov_b32 s19, s11 +; GFX1250-NEXT: s_mov_b32 s22, s10 +; GFX1250-NEXT: s_mov_b32 s23, s11 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s12, s2 +; GFX1250-NEXT: s_mov_b32 s13, s3 +; GFX1250-NEXT: s_mov_b32 s16, s4 +; GFX1250-NEXT: s_mov_b32 s17, s5 +; GFX1250-NEXT: s_mov_b32 s20, s6 +; GFX1250-NEXT: s_mov_b32 s21, s7 +; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s8, s0 +; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: v_max3_num_f32 v0, v2, v0, v1 +; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX1250-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -544,6 +606,66 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; GFX12-FAKE16-NEXT: v_max3_num_f16 v0, v0, v1, v2 ; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null ; GFX12-FAKE16-NEXT: s_endpgm +; +; GFX1250-TRUE16-LABEL: test_fmax3_olt_0_f16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10 +; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11 +; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10 +; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4 +; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5 +; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6 +; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7 +; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v1.l, v2.l +; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-TRUE16-NEXT: s_endpgm +; +; GFX1250-FAKE16-LABEL: test_fmax3_olt_0_f16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10 +; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11 +; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10 +; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4 +; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5 +; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6 +; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7 +; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-FAKE16-NEXT: v_max3_num_f16 v0, v0, v1, v2 +; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-FAKE16-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -772,6 +894,66 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; GFX12-FAKE16-NEXT: v_max3_num_f16 v0, v2, v0, v1 ; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null ; GFX12-FAKE16-NEXT: s_endpgm +; +; GFX1250-TRUE16-LABEL: test_fmax3_olt_1_f16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10 +; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11 +; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10 +; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4 +; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5 +; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6 +; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7 +; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[16:19], null scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[20:23], null scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v1.l, v2.l +; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-TRUE16-NEXT: s_endpgm +; +; GFX1250-FAKE16-LABEL: test_fmax3_olt_1_f16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10 +; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11 +; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10 +; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4 +; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5 +; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6 +; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7 +; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-FAKE16-NEXT: v_max3_num_f16 v0, v2, v0, v1 +; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-FAKE16-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -850,6 +1032,15 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v0 ; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: no_fmax3_v2f16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_max_num_f16 v0, v0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_max3_num_f16 v0, v2, v0, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max) diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index 96956e2851b4a..6dfefd8a6052a 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -6,6 +6,8 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-FAKE16 %s define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_0_f32: @@ -157,6 +159,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: v_min3_num_f32 v0, v0, v1, v2 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: test_fmin3_olt_0_f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s10, -1 +; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s14, s10 +; GFX1250-NEXT: s_mov_b32 s15, s11 +; GFX1250-NEXT: s_mov_b32 s18, s10 +; GFX1250-NEXT: s_mov_b32 s19, s11 +; GFX1250-NEXT: s_mov_b32 s22, s10 +; GFX1250-NEXT: s_mov_b32 s23, s11 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s12, s2 +; GFX1250-NEXT: s_mov_b32 s13, s3 +; GFX1250-NEXT: s_mov_b32 s16, s4 +; GFX1250-NEXT: s_mov_b32 s17, s5 +; GFX1250-NEXT: s_mov_b32 s20, s6 +; GFX1250-NEXT: s_mov_b32 s21, s7 +; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s8, s0 +; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: v_min3_num_f32 v0, v0, v1, v2 +; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX1250-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -317,6 +349,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: v_min3_num_f32 v0, v2, v0, v1 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: test_fmin3_olt_1_f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s10, -1 +; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s14, s10 +; GFX1250-NEXT: s_mov_b32 s15, s11 +; GFX1250-NEXT: s_mov_b32 s18, s10 +; GFX1250-NEXT: s_mov_b32 s19, s11 +; GFX1250-NEXT: s_mov_b32 s22, s10 +; GFX1250-NEXT: s_mov_b32 s23, s11 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s12, s2 +; GFX1250-NEXT: s_mov_b32 s13, s3 +; GFX1250-NEXT: s_mov_b32 s16, s4 +; GFX1250-NEXT: s_mov_b32 s17, s5 +; GFX1250-NEXT: s_mov_b32 s20, s6 +; GFX1250-NEXT: s_mov_b32 s21, s7 +; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s8, s0 +; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: v_min3_num_f32 v0, v2, v0, v1 +; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX1250-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -544,6 +606,66 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; GFX12-FAKE16-NEXT: v_min3_num_f16 v0, v0, v1, v2 ; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null ; GFX12-FAKE16-NEXT: s_endpgm +; +; GFX1250-TRUE16-LABEL: test_fmin3_olt_0_f16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10 +; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11 +; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10 +; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4 +; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5 +; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6 +; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7 +; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-TRUE16-NEXT: v_min3_num_f16 v0.l, v0.l, v1.l, v2.l +; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-TRUE16-NEXT: s_endpgm +; +; GFX1250-FAKE16-LABEL: test_fmin3_olt_0_f16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10 +; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11 +; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10 +; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4 +; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5 +; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6 +; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7 +; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-FAKE16-NEXT: v_min3_num_f16 v0, v0, v1, v2 +; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-FAKE16-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -772,6 +894,66 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; GFX12-FAKE16-NEXT: v_min3_num_f16 v0, v2, v0, v1 ; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null ; GFX12-FAKE16-NEXT: s_endpgm +; +; GFX1250-TRUE16-LABEL: test_fmin3_olt_1_f16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10 +; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11 +; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10 +; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4 +; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5 +; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6 +; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7 +; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[16:19], null scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[20:23], null scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-TRUE16-NEXT: v_min3_num_f16 v0.l, v0.l, v1.l, v2.l +; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-TRUE16-NEXT: s_endpgm +; +; GFX1250-FAKE16-LABEL: test_fmin3_olt_1_f16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10 +; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11 +; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10 +; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4 +; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5 +; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6 +; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7 +; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-FAKE16-NEXT: v_min3_num_f16 v0, v2, v0, v1 +; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-FAKE16-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -850,6 +1032,15 @@ define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; GFX12-NEXT: v_pk_min_num_f16 v0, v2, v0 ; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: no_fmin3_v2f16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_min_num_f16 v0, v0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_min3_num_f16 v0, v2, v0, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) %min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min) @@ -1023,6 +1214,40 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: test_fmin3_olt_0_f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s10, -1 +; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s14, s10 +; GFX1250-NEXT: s_mov_b32 s15, s11 +; GFX1250-NEXT: s_mov_b32 s18, s10 +; GFX1250-NEXT: s_mov_b32 s19, s11 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s12, s2 +; GFX1250-NEXT: s_mov_b32 s13, s3 +; GFX1250-NEXT: s_mov_b32 s16, s4 +; GFX1250-NEXT: s_mov_b32 s17, s5 +; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x1 +; GFX1250-NEXT: s_mov_b32 s12, s6 +; GFX1250-NEXT: s_mov_b32 s13, s7 +; GFX1250-NEXT: s_mov_b32 s8, s0 +; GFX1250-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null +; GFX1250-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 %b = load volatile double, ptr addrspace(1) %bptr, align 4 %c = load volatile double, ptr addrspace(1) %cptr, align 4 @@ -1199,6 +1424,40 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: test_fmin3_olt_1_f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s10, -1 +; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s14, s10 +; GFX1250-NEXT: s_mov_b32 s15, s11 +; GFX1250-NEXT: s_mov_b32 s18, s10 +; GFX1250-NEXT: s_mov_b32 s19, s11 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s12, s2 +; GFX1250-NEXT: s_mov_b32 s13, s3 +; GFX1250-NEXT: s_mov_b32 s16, s4 +; GFX1250-NEXT: s_mov_b32 s17, s5 +; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x1 +; GFX1250-NEXT: s_mov_b32 s12, s6 +; GFX1250-NEXT: s_mov_b32 s13, s7 +; GFX1250-NEXT: s_mov_b32 s8, s0 +; GFX1250-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null +; GFX1250-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 %b = load volatile double, ptr addrspace(1) %bptr, align 4 %c = load volatile double, ptr addrspace(1) %cptr, align 4 diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s index 0710344e3f057..88346941bb2cd 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s @@ -1169,3 +1169,147 @@ v_pk_maximum3_f16 v1, v4, v9, v16 v_pk_maximum3_f16 v1, v2, v5, 1.0 // GFX1250: v_pk_maximum3_f16 v1, v2, v5, 1.0 ; encoding: [0x01,0x40,0x37,0xcc,0x02,0x0b,0xca,0x1b] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_min3_num_f16 v8, v1, s1, v4 +// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_min3_num_f16 v8, v1, s1, v4 +// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_min3_num_f16 v8, v1, s1, v4 +// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_min3_num_f16 v8, v1, s1, v4 +// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_min3_num_f16 v8, v1, s1, v4 op_sel_hi:[0,0,0] +// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x38,0xcc,0x01,0x03,0x10,0x04] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_min3_num_f16 v8, v1, s1, v4 op_sel:[0,0,1] op_sel_hi:[0,0,1] +// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x38,0xcc,0x01,0x03,0x10,0x04] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] +// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0xfc] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[1,1,1] +// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x38,0xcc,0x01,0x03,0x10,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] neg_hi:[1,1,1] +// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x38,0xcc,0x01,0x03,0x10,0xfc] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[1,0,0] +// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x3c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[0,1,0] +// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x5c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[0,0,1] +// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x9c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[1,0,0] +// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x38,0xcc,0x01,0x03,0x10,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[0,1,0] +// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x38,0xcc,0x01,0x03,0x10,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[0,0,1] +// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x38,0xcc,0x01,0x03,0x10,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_min3_num_f16 v8, v1, s1, v4 clamp +// GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 clamp ; encoding: [0x08,0xc0,0x38,0xcc,0x01,0x03,0x10,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_min3_num_f16 v1, v4, v9, v16 +// GFX1250: v_pk_min3_num_f16 v1, v4, v9, v16 ; encoding: [0x01,0x40,0x38,0xcc,0x04,0x13,0x42,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_min3_num_f16 v1, v2, v5, 1.0 +// GFX1250: v_pk_min3_num_f16 v1, v2, v5, 1.0 ; encoding: [0x01,0x40,0x38,0xcc,0x02,0x0b,0xca,0x1b] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_max3_num_f16 v8, v1, s1, v4 +// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_max3_num_f16 v8, v1, s1, v4 +// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_max3_num_f16 v8, v1, s1, v4 +// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_max3_num_f16 v8, v1, s1, v4 +// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_max3_num_f16 v8, v1, s1, v4 op_sel_hi:[0,0,0] +// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x39,0xcc,0x01,0x03,0x10,0x04] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_max3_num_f16 v8, v1, s1, v4 op_sel:[0,0,1] op_sel_hi:[0,0,1] +// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x39,0xcc,0x01,0x03,0x10,0x04] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] +// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0xfc] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[1,1,1] +// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x39,0xcc,0x01,0x03,0x10,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] neg_hi:[1,1,1] +// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x39,0xcc,0x01,0x03,0x10,0xfc] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[1,0,0] +// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x3c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[0,1,0] +// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x5c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[0,0,1] +// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x9c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[1,0,0] +// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x39,0xcc,0x01,0x03,0x10,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[0,1,0] +// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x39,0xcc,0x01,0x03,0x10,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[0,0,1] +// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x39,0xcc,0x01,0x03,0x10,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_max3_num_f16 v8, v1, s1, v4 clamp +// GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 clamp ; encoding: [0x08,0xc0,0x39,0xcc,0x01,0x03,0x10,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_max3_num_f16 v1, v4, v9, v16 +// GFX1250: v_pk_max3_num_f16 v1, v4, v9, v16 ; encoding: [0x01,0x40,0x39,0xcc,0x04,0x13,0x42,0x1c] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_pk_max3_num_f16 v1, v2, v5, 1.0 +// GFX1250: v_pk_max3_num_f16 v1, v2, v5, 1.0 ; encoding: [0x01,0x40,0x39,0xcc,0x02,0x0b,0xca,0x1b] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt index 64106de88260f..d3ef89957c255 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt @@ -815,3 +815,93 @@ # GFX1250: v_pk_maximum3_f16 v8, v1, s1, v4 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x37,0xcc,0x01,0x03,0x10,0x04] 0x08,0x00,0x37,0xcc,0x01,0x03,0x10,0x04 + +# GFX1250: v_pk_min3_num_f16 v1, v2, v5, 1.0 ; encoding: [0x01,0x40,0x38,0xcc,0x02,0x0b,0xca,0x1b] +0x01,0x40,0x38,0xcc,0x02,0x0b,0xca,0x1b + +# GFX1250: v_pk_min3_num_f16 v1, v4, v9, v16 ; encoding: [0x01,0x40,0x38,0xcc,0x04,0x13,0x42,0x1c] +0x01,0x40,0x38,0xcc,0x04,0x13,0x42,0x1c + +# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x1c] +0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x1c + +# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 clamp ; encoding: [0x08,0xc0,0x38,0xcc,0x01,0x03,0x10,0x1c] +0x08,0xc0,0x38,0xcc,0x01,0x03,0x10,0x1c + +# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x38,0xcc,0x01,0x03,0x10,0x1c] +0x08,0x44,0x38,0xcc,0x01,0x03,0x10,0x1c + +# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x38,0xcc,0x01,0x03,0x10,0x1c] +0x08,0x42,0x38,0xcc,0x01,0x03,0x10,0x1c + +# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x38,0xcc,0x01,0x03,0x10,0x1c] +0x08,0x41,0x38,0xcc,0x01,0x03,0x10,0x1c + +# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x38,0xcc,0x01,0x03,0x10,0x1c] +0x08,0x47,0x38,0xcc,0x01,0x03,0x10,0x1c + +# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x9c] +0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x9c + +# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x5c] +0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x5c + +# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x3c] +0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0x3c + +# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0xfc] +0x08,0x40,0x38,0xcc,0x01,0x03,0x10,0xfc + +# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x38,0xcc,0x01,0x03,0x10,0xfc] +0x08,0x47,0x38,0xcc,0x01,0x03,0x10,0xfc + +# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x38,0xcc,0x01,0x03,0x10,0x04] +0x08,0x60,0x38,0xcc,0x01,0x03,0x10,0x04 + +# GFX1250: v_pk_min3_num_f16 v8, v1, s1, v4 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x38,0xcc,0x01,0x03,0x10,0x04] +0x08,0x00,0x38,0xcc,0x01,0x03,0x10,0x04 + +# GFX1250: v_pk_max3_num_f16 v1, v2, v5, 1.0 ; encoding: [0x01,0x40,0x39,0xcc,0x02,0x0b,0xca,0x1b] +0x01,0x40,0x39,0xcc,0x02,0x0b,0xca,0x1b + +# GFX1250: v_pk_max3_num_f16 v1, v4, v9, v16 ; encoding: [0x01,0x40,0x39,0xcc,0x04,0x13,0x42,0x1c] +0x01,0x40,0x39,0xcc,0x04,0x13,0x42,0x1c + +# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x1c] +0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x1c + +# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 clamp ; encoding: [0x08,0xc0,0x39,0xcc,0x01,0x03,0x10,0x1c] +0x08,0xc0,0x39,0xcc,0x01,0x03,0x10,0x1c + +# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x39,0xcc,0x01,0x03,0x10,0x1c] +0x08,0x44,0x39,0xcc,0x01,0x03,0x10,0x1c + +# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x39,0xcc,0x01,0x03,0x10,0x1c] +0x08,0x42,0x39,0xcc,0x01,0x03,0x10,0x1c + +# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x39,0xcc,0x01,0x03,0x10,0x1c] +0x08,0x41,0x39,0xcc,0x01,0x03,0x10,0x1c + +# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x39,0xcc,0x01,0x03,0x10,0x1c] +0x08,0x47,0x39,0xcc,0x01,0x03,0x10,0x1c + +# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x9c] +0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x9c + +# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x5c] +0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x5c + +# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x3c] +0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0x3c + +# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0xfc] +0x08,0x40,0x39,0xcc,0x01,0x03,0x10,0xfc + +# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x39,0xcc,0x01,0x03,0x10,0xfc] +0x08,0x47,0x39,0xcc,0x01,0x03,0x10,0xfc + +# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x39,0xcc,0x01,0x03,0x10,0x04] +0x08,0x60,0x39,0xcc,0x01,0x03,0x10,0x04 + +# GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x39,0xcc,0x01,0x03,0x10,0x04] +0x08,0x00,0x39,0xcc,0x01,0x03,0x10,0x04