diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 7465f13d552d6..e562ef04a3019 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -410,6 +410,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-insts") // GFX12+ only builtins. //===----------------------------------------------------------------------===// +TARGET_BUILTIN(__builtin_amdgcn_permlane16_var, "UiUiUiUiIbIb", "nc", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_permlanex16_var, "UiUiUiUiIbIb", "nc", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal, "vIi", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_var, "vi", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_wait, "vIs", "n", "gfx12-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl index b8d281531e218..2899d9e5c2889 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl @@ -1,6 +1,54 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s +// REQUIRES: amdgpu-registered-target + +typedef unsigned int uint; + +// CHECK-LABEL: @test_permlane16_var( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 
4 +// CHECK-NEXT: store i32 [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[C_ADDR]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane16.var(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false, i1 false) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT: ret void +// +void test_permlane16_var(global uint* out, uint a, uint b, uint c) { + *out = __builtin_amdgcn_permlane16_var(a, b, c, 0, 0); +} + +// CHECK-LABEL: @test_permlanex16_var( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 4 +// CHECK-NEXT: store i32 [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[C_ADDR]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlanex16.var(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false, i1 false) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT: ret void +// +void 
test_permlanex16_var(global uint* out, uint a, uint b, uint c) { + *out = __builtin_amdgcn_permlanex16_var(a, b, c, 0, 0); +} + // CHECK-LABEL: @test_s_barrier_signal( // CHECK-NEXT: entry: // CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1) diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx12-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx12-param.cl new file mode 100644 index 0000000000000..0e0ea3646a2f1 --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx12-param.cl @@ -0,0 +1,14 @@ +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1200 -verify -S -o - %s + +typedef unsigned int uint; + +void test_permlane16_var(global uint* out, uint a, uint b, uint c, uint d) { + *out = __builtin_amdgcn_permlane16_var(a, b, c, d, 1); // expected-error{{argument to '__builtin_amdgcn_permlane16_var' must be a constant integer}} + *out = __builtin_amdgcn_permlane16_var(a, b, c, 1, d); // expected-error{{argument to '__builtin_amdgcn_permlane16_var' must be a constant integer}} +} + +void test_permlanex16_var(global uint* out, uint a, uint b, uint c, uint d) { + *out = __builtin_amdgcn_permlanex16_var(a, b, c, d, 1); // expected-error{{argument to '__builtin_amdgcn_permlanex16_var' must be a constant integer}} + *out = __builtin_amdgcn_permlanex16_var(a, b, c, 1, d); // expected-error{{argument to '__builtin_amdgcn_permlanex16_var' must be a constant integer}} +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx12.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx12.cl new file mode 100644 index 0000000000000..34887a65021c3 --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx12.cl @@ -0,0 +1,16 @@ +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu tahiti -verify -S -o - %s +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu hawaii -verify -S -o - %s +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu fiji -verify -S -o - %s +// RUN: 
%clang_cc1 -triple amdgcn-- -target-cpu gfx900 -verify -S -o - %s +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx908 -verify -S -o - %s +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -verify -S -o - %s +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1030 -verify -S -o - %s +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1100 -verify -S -o - %s + +typedef unsigned int uint; + +void test(global uint* out, uint a, uint b, uint c) { + *out = __builtin_amdgcn_permlane16_var(a, b, c, 1, 1); // expected-error {{'__builtin_amdgcn_permlane16_var' needs target feature gfx12-insts}} + *out = __builtin_amdgcn_permlanex16_var(a, b, c, 1, 1); // expected-error {{'__builtin_amdgcn_permlanex16_var' needs target feature gfx12-insts}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 09e88152e65d2..cf054e89069d7 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2460,6 +2460,24 @@ def int_amdgcn_s_wait_event_export_ready : Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn] >; +//===----------------------------------------------------------------------===// +// GFX12 Intrinsics +//===----------------------------------------------------------------------===// + +// llvm.amdgcn.permlane16.var +def int_amdgcn_permlane16_var : ClangBuiltin<"__builtin_amdgcn_permlane16_var">, + Intrinsic<[llvm_i32_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], + [IntrNoMem, IntrConvergent, IntrWillReturn, + ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree]>; + +// llvm.amdgcn.permlanex16.var +def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var">, + Intrinsic<[llvm_i32_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], + [IntrNoMem, IntrConvergent, IntrWillReturn, + ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree]>; + 
//===----------------------------------------------------------------------===// // Deep learning intrinsics. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 5296415ab4c36..ee93d9eb4c0a0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -992,14 +992,27 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { return IC.replaceOperand(II, 0, UndefValue::get(Old->getType())); } case Intrinsic::amdgcn_permlane16: - case Intrinsic::amdgcn_permlanex16: { + case Intrinsic::amdgcn_permlane16_var: + case Intrinsic::amdgcn_permlanex16: + case Intrinsic::amdgcn_permlanex16_var: { // Discard vdst_in if it's not going to be read. Value *VDstIn = II.getArgOperand(0); if (isa(VDstIn)) break; - ConstantInt *FetchInvalid = cast(II.getArgOperand(4)); - ConstantInt *BoundCtrl = cast(II.getArgOperand(5)); + // FetchInvalid operand idx. + unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 || + IID == Intrinsic::amdgcn_permlanex16) + ? 4 /* for permlane16 and permlanex16 */ + : 3; /* for permlane16_var and permlanex16_var */ + + // BoundCtrl operand idx. 
+ // For permlane16 and permlanex16 it should be 5 + // For Permlane16_var and permlanex16_var it should be 4 + unsigned int BcIdx = FiIdx + 1; + + ConstantInt *FetchInvalid = cast(II.getArgOperand(FiIdx)); + ConstantInt *BoundCtrl = cast(II.getArgOperand(BcIdx)); if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue()) break; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 03b6d19b2b3c0..269c065fdf7ab 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4435,6 +4435,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); break; } + case Intrinsic::amdgcn_permlane16_var: + case Intrinsic::amdgcn_permlanex16_var: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + break; + } case Intrinsic::amdgcn_mfma_f32_4x4x1f32: case Intrinsic::amdgcn_mfma_f32_4x4x4f16: case Intrinsic::amdgcn_mfma_i32_4x4x4i8: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 317f3f21d2400..fc05f14744c0f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -333,6 +333,8 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp 
index 3af71727c5b74..a7d8ff0242b80 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -163,7 +163,9 @@ static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, static bool isPermlane(const MachineInstr &MI) { unsigned Opcode = MI.getOpcode(); return Opcode == AMDGPU::V_PERMLANE16_B32_e64 || - Opcode == AMDGPU::V_PERMLANEX16_B32_e64; + Opcode == AMDGPU::V_PERMLANEX16_B32_e64 || + Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 || + Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64; } static bool isLdsDma(const MachineInstr &MI) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 4edd7960bd8c4..0f92a56237acb 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -498,7 +498,9 @@ bool isPermlane16(unsigned Opc) { Opc == AMDGPU::V_PERMLANE16_B32_e64_gfx11 || Opc == AMDGPU::V_PERMLANEX16_B32_e64_gfx11 || Opc == AMDGPU::V_PERMLANE16_B32_e64_gfx12 || - Opc == AMDGPU::V_PERMLANEX16_B32_e64_gfx12; + Opc == AMDGPU::V_PERMLANEX16_B32_e64_gfx12 || + Opc == AMDGPU::V_PERMLANE16_VAR_B32_e64_gfx12 || + Opc == AMDGPU::V_PERMLANEX16_VAR_B32_e64_gfx12; } bool isGenericAtomic(unsigned Opc) { diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 685c9ac6a2be4..2733f1d5634d8 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -735,6 +735,15 @@ def VOP3_PERMLANE_Profile : VOP3_Profile, VOP3 let HasExtDPP = 0; } +def VOP3_PERMLANE_VAR_Profile : VOP3_Profile, VOP3_OPSEL> { + let InsVOP3OpSel = (ins IntOpSelMods:$src0_modifiers, VRegSrc_32:$src0, + IntOpSelMods:$src1_modifiers, VRegSrc_32:$src1, + VGPR_32:$vdst_in, op_sel0:$op_sel); + let HasClamp = 0; + let HasExtVOP3DPP = 0; + let HasExtDPP = 0; +} + def opsel_i1timm : SDNodeXFormgetTargetConstant( N->getZExtValue() ? 
SISrcMods::OP_SEL_0 : SISrcMods::NONE, @@ -751,6 +760,13 @@ class PermlanePat; +class PermlaneVarPat : GCNPat< + (permlane i32:$vdst_in, i32:$src0, i32:$src1, + timm:$fi, timm:$bc), + (inst (opsel_i1timm $fi), VGPR_32:$src0, (opsel_i1timm $bc), + VGPR_32:$src1, VGPR_32:$vdst_in) +>; let SubtargetPredicate = isGFX10Plus in { let isCommutable = 1, isReMaterializable = 1 in { @@ -781,6 +797,17 @@ let SubtargetPredicate = isGFX10Plus in { } // End SubtargetPredicate = isGFX10Plus +let SubtargetPredicate = isGFX12Plus in { + let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { + defm V_PERMLANE16_VAR_B32 : VOP3Inst<"v_permlane16_var_b32", VOP3_PERMLANE_VAR_Profile>; + defm V_PERMLANEX16_VAR_B32 : VOP3Inst<"v_permlanex16_var_b32", VOP3_PERMLANE_VAR_Profile>; + } // End $vdst = $vdst_in, DisableEncoding $vdst_in + + def : PermlaneVarPat; + def : PermlaneVarPat; + +} // End SubtargetPredicate = isGFX12Plus + class DivFmasPat : GCNPat< (AMDGPUdiv_fmas (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), @@ -915,6 +942,9 @@ defm V_MAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x366>; defm V_MINIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x367>; defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x368>; +defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>; +defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>; + //===----------------------------------------------------------------------===// // GFX11, GFX12 //===----------------------------------------------------------------------===// diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index 1b3f7973c0d9e..8826263eabb69 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -21,6 +21,20 @@ define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i ret void } +; CHECK: 
DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 +define amdgpu_kernel void @v_permlane16_var_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { + %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 + store i32 %v, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 +define amdgpu_kernel void @v_permlanex16_var_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { + %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 + store i32 %v, ptr addrspace(1) %out + ret void +} + ; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) #0 define amdgpu_kernel void @update_dpp(ptr addrspace(1) %out, i32 %in1, i32 %in2) #0 { %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) #0 @@ -98,6 +112,8 @@ bb: declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1 declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) #1 declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) #1 +declare i32 @llvm.amdgcn.permlane16.var(i32, i32, i32, i1, i1) #1 +declare i32 @llvm.amdgcn.permlanex16.var(i32, i32, i32, i1, i1) #1 declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1 declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll new file mode 100644 index 0000000000000..131a3951b2bf2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll @@ -0,0 +1,896 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 
-amdgpu-load-store-vectorizer=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-SDAG %s +; RUN: llc -global-isel=1 -amdgpu-load-store-vectorizer=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s + +declare i32 @llvm.amdgcn.permlane16.var(i32, i32, i32, i1, i1) +declare i32 @llvm.amdgcn.permlanex16.var(i32, i32, i32, i1, i1) +declare i32 @llvm.amdgcn.workitem.id.x() +declare i32 @llvm.amdgcn.workitem.id.y() + +define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX12-SDAG-LABEL: v_permlane16var_b32_vv: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16var_b32_vv: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 %src1, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %src0) { +; GFX12-SDAG-LABEL: v_permlane16var_b32_vi: +; GFX12-SDAG: ; 
%bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v1, v0 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16var_b32_vi: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v1, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 1, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %src0) { +; GFX12-SDAG-LABEL: v_permlane16var_b32_vl: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v1, v0 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16var_b32_vl: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v1, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 49617, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) { +; GFX12-SDAG-LABEL: v_permlane16var_b32_vvv: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v1, v0 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16var_b32_vvv: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v1, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: global_store_b32 v0, 
v1, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 %tidx, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_fi: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 %src1, i1 true, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_bc: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_bc: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 %src1, i1 false, i1 true) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_fi_bc: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi_bc: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 %src1, i1 true, i1 true) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src0, i32 %src1, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %src0) { +; GFX12-SDAG-LABEL: v_permlanex16var_b32_vi: +; GFX12-SDAG: ; %bb.0: +; 
GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v1, v0 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16var_b32_vi: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v1, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src0, i32 1, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %src0) { +; GFX12-SDAG-LABEL: v_permlanex16var_b32_vl: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v1, v0 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16var_b32_vl: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v1, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src0, i32 49617, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) { +; GFX12-SDAG-LABEL: v_permlanex16var_b32_vvv: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v1, v0 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16var_b32_vvv: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v1, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: 
global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src0, i32 %tidx, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_fi: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src0, i32 %src1, i1 true, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_bc: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 
0 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_bc: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src0, i32 %src1, i1 false, i1 true) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_fi_bc: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi_bc: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 
v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src0, i32 %src1, i1 true, i1 true) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX12-SDAG-LABEL: v_permlane16var_b32_tid_tid: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16var_b32_tid_tid: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlane16.var(i32 %tidx, i32 %tidx, i32 %src1, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out 
+ ret void +} + +define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX12-SDAG-LABEL: v_permlane16var_b32_undef_tid: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16var_b32_undef_tid: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlane16.var(i32 %undef, i32 %tidx, i32 %src1, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: 
v_dual_mov_b32 v2, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v0, v2 +; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v0, v2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlane16.var(i32 12345, i32 %tidx, i32 %src1, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_fi: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlane16.var(i32 %undef, i32 %tidx, i32 %src1, i1 true, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_bc: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_bc: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlane16.var(i32 %undef, i32 %tidx, i32 %src1, i1 false, i1 true) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_fi_bc: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi_bc: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlane16.var(i32 %undef, i32 %tidx, i32 %src1, i1 true, i1 true) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX12-SDAG-LABEL: v_permlanex16var_b32_tid_tid: +; GFX12-SDAG: ; 
%bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16var_b32_tid_tid: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %tidx, i32 %tidx, i32 %src1, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX12-SDAG-LABEL: v_permlanex16var_b32_undef_tid: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16var_b32_undef_tid: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %undef, i32 %tidx, i32 %src1, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v0, v2 +; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2 +; GFX12-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v0, v2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlanex16.var(i32 12345, i32 %tidx, i32 %src1, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_fi: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %undef, i32 %tidx, i32 %src1, i1 true, i1 false) + 
store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_bc: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_bc: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %undef, i32 %tidx, i32 %src1, i1 false, i1 true) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_fi_bc: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: 
v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi_bc: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %undef, i32 %tidx, i32 %src1, i1 true, i1 true) + store i32 %v, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/permlane16_var-op-sel.ll b/llvm/test/CodeGen/AMDGPU/permlane16_var-op-sel.ll new file mode 100644 index 0000000000000..09417136bc9d5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/permlane16_var-op-sel.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=amdgcn -mcpu=gfx1200 -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=gfx1200 -d - | FileCheck -check-prefix=OBJ %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -show-mc-encoding < %s | FileCheck -check-prefix=ASM %s + +declare i32 @llvm.amdgcn.permlane16.var(i32, i32, i32, i1, i1) + +; OBJ-LABEL: <permlane_op_sel>: +; OBJ: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] + +; ASM-LABEL: permlane_op_sel: +; ASM: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] ; encoding: [0x00,0x08,0x0f,0xd7,0x00,0x03,0x02,0x00] +define amdgpu_kernel void 
@permlane_op_sel(ptr addrspace(1) %out, i32 %src0, i32 %src1) { + %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 %src1, i1 1, i1 0) + store i32 %v, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane16var-hazard.mir b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane16var-hazard.mir new file mode 100644 index 0000000000000..bf598153fc23c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane16var-hazard.mir @@ -0,0 +1,168 @@ +# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: name: hazard_vcmpx_permlane16var +# GCN: V_CMPX_LE_F32_nosdst_e32 +# GCN: S_ADD_U32 +# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec +# GCN-NEXT: V_PERMLANE16_VAR_B32_e64 +--- +name: hazard_vcmpx_permlane16var +body: | + bb.0: + successors: %bb.1 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec + S_BRANCH %bb.1 + + bb.1: + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $sgpr1 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + $vgpr1 = V_PERMLANE16_VAR_B32_e64 0, killed $vgpr1, 0, killed $vgpr2, $vgpr1, 0, implicit $exec + S_ENDPGM 0 +... + +# GCN-LABEL: name: hazard_vcmpx_permlanex16var +# GCN: V_CMPX_LE_F32_nosdst_e32 +# GCN: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec +# GCN-NEXT: V_PERMLANEX16_VAR_B32_e64 +--- +name: hazard_vcmpx_permlanex16var +body: | + bb.0: + successors: %bb.1 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec + S_BRANCH %bb.1 + + bb.1: + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr1 = V_PERMLANEX16_VAR_B32_e64 0, killed $vgpr1, 0, killed $vgpr2, $vgpr1, 0, implicit $exec + S_ENDPGM 0 +... 
+ +# GCN-LABEL: name: hazard_vcmpx_permlane16var_v_nop +# GCN: V_CMPX_LE_F32_nosdst_e32 +# GCN: V_NOP +# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec +# GCN-NEXT: V_PERMLANE16_VAR_B32_e64 +--- +name: hazard_vcmpx_permlane16var_v_nop +body: | + bb.0: + successors: %bb.1 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec + S_BRANCH %bb.1 + + bb.1: + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + V_NOP_e32 implicit $exec + $vgpr1 = V_PERMLANE16_VAR_B32_e64 0, killed $vgpr1, 0, killed $vgpr2, $vgpr1, 0, implicit $exec + S_ENDPGM 0 +... + +# GCN-LABEL: name: hazard_vcmpx_permlane16var_far +# GCN: V_CMPX_LE_F32_nosdst_e32 +# GCN: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec +# GCN-NEXT: V_PERMLANE16_VAR_B32_e64 +--- +name: hazard_vcmpx_permlane16var_far +body: | + bb.0: + successors: %bb.1 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec + S_BRANCH %bb.1 + + bb.1: + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + V_NOP_e32 implicit $exec + V_NOP_e32 implicit $exec + V_NOP_e32 implicit $exec + V_NOP_e32 implicit $exec + V_NOP_e32 implicit $exec + V_NOP_e32 implicit $exec + V_NOP_e32 implicit $exec + V_NOP_e32 implicit $exec + V_NOP_e32 implicit $exec + $vgpr1 = V_PERMLANE16_VAR_B32_e64 0, killed $vgpr1, 0, killed $vgpr1, $vgpr1, 0, implicit $exec + S_ENDPGM 0 +... 
+ +# GCN-LABEL: name: hazard_vcmpx_permlane16var_no_hazard +# GCN: V_CMPX_LE_F32_nosdst_e32 +# GCN: V_ADD_F32 +# GCN-NEXT: V_PERMLANE16_VAR_B32_e64 +--- +name: hazard_vcmpx_permlane16var_no_hazard +body: | + bb.0: + successors: %bb.1 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec + S_BRANCH %bb.1 + + bb.1: + $vgpr1 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr1 = V_PERMLANE16_VAR_B32_e64 0, killed $vgpr1, 0, killed $vgpr2, $vgpr1, 0, implicit $exec + S_ENDPGM 0 +... + +# GCN-LABEL: name: hazard_vcmpx_permlane16var_undef_src +# GCN: V_CMPX_LE_F32_nosdst_e32 +# GCN: S_ADD_U32 +# GCN-NEXT: dead $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec +# GCN-NEXT: V_PERMLANE16_VAR_B32_e64 +--- +name: hazard_vcmpx_permlane16var_undef_src +body: | + bb.0: + successors: %bb.1 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec + S_BRANCH %bb.1 + + bb.1: + $vgpr2 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $sgpr1 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + $vgpr1 = V_PERMLANE16_VAR_B32_e64 0, undef $vgpr1, 0, killed $vgpr2, undef $vgpr1, 0, implicit $exec + S_ENDPGM 0 +... 
+ +# GCN-LABEL: name: hazard_vcmpx_e64_permlane16var +# GCN: V_CMPX_LE_F32_nosdst_e64 +# GCN: S_ADD_U32 +# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec +# GCN-NEXT: V_PERMLANE16_VAR_B32_e64 +--- +name: hazard_vcmpx_e64_permlane16var +body: | + bb.0: + successors: %bb.1 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + V_CMPX_LE_F32_nosdst_e64 0, 0, 0, $vgpr0, 0, implicit-def $exec, implicit $mode, implicit $exec + S_BRANCH %bb.1 + + bb.1: + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $sgpr1 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + $vgpr1 = V_PERMLANE16_VAR_B32_e64 0, killed $vgpr1, 0, killed $vgpr2, $vgpr1, 0, implicit $exec + S_ENDPGM 0 +... diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported.s b/llvm/test/MC/AMDGPU/gfx11_unsupported.s index c6ec918147014..89078c1ad4e04 100644 --- a/llvm/test/MC/AMDGPU/gfx11_unsupported.s +++ b/llvm/test/MC/AMDGPU/gfx11_unsupported.s @@ -1585,6 +1585,12 @@ v_mov_b64_e64 v[10:11], v[2:3] v_mul_lo_i32 v0, v1, v2 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_permlane16_var_b32 v0, v0, v1 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_permlanex16_var_b32 v0, v0, v1 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + v_pk_add_f32 v[10:11], v[2:3], v[4:5] // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s index e97e4b8c7241c..fefb345c1288b 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s @@ -5152,6 +5152,57 @@ v_permlanex16_b32 v5, v1, 0.5, null op_sel:[1,0] v_permlanex16_b32 v255, v255, src_scc, exec_hi op_sel:[0,1] // GFX12: encoding: [0xff,0x10,0x5c,0xd6,0xff,0xfb,0xfd,0x01] +v_permlane16_var_b32 v5, v1, v2 +// GFX12: encoding: [0x05,0x00,0x0f,0xd7,0x01,0x05,0x02,0x00] + +v_permlane16_var_b32 v5, v1, v255 +// GFX12: encoding: 
[0x05,0x00,0x0f,0xd7,0x01,0xff,0x03,0x00] + +v_permlane16_var_b32 v5, v255, v0 +// GFX12: encoding: [0x05,0x00,0x0f,0xd7,0xff,0x01,0x02,0x00] + +v_permlane16_var_b32 v255, v1, v2 +// GFX12: encoding: [0xff,0x00,0x0f,0xd7,0x01,0x05,0x02,0x00] + +v_permlane16_var_b32 v5, v1, v50, op_sel:[1,1] +// GFX12: encoding: [0x05,0x18,0x0f,0xd7,0x01,0x65,0x02,0x00] + +v_permlane16_var_b32 v5, v1, v50, op_sel:[0,0] +// GFX12: encoding: [0x05,0x00,0x0f,0xd7,0x01,0x65,0x02,0x00] + +v_permlane16_var_b32 v5, v1, v50, op_sel:[1,0] +// GFX12: encoding: [0x05,0x08,0x0f,0xd7,0x01,0x65,0x02,0x00] + +v_permlane16_var_b32 v255, v255, v0, op_sel:[0,1] +// GFX12: encoding: [0xff,0x10,0x0f,0xd7,0xff,0x01,0x02,0x00] + +v_permlanex16_var_b32 v5, v1, v2 +// GFX12: encoding: [0x05,0x00,0x10,0xd7,0x01,0x05,0x02,0x00] + +v_permlanex16_var_b32 v5, v1, v105 +// GFX12: encoding: [0x05,0x00,0x10,0xd7,0x01,0xd3,0x02,0x00] + +v_permlanex16_var_b32 v5, v1, v255 +// GFX12: encoding: [0x05,0x00,0x10,0xd7,0x01,0xff,0x03,0x00] + +v_permlanex16_var_b32 v255, v1, v2 +// GFX12: encoding: [0xff,0x00,0x10,0xd7,0x01,0x05,0x02,0x00] + +v_permlanex16_var_b32 v1, v255, v2 +// GFX12: encoding: [0x01,0x00,0x10,0xd7,0xff,0x05,0x02,0x00] + +v_permlanex16_var_b32 v5, v1, v100, op_sel:[1,1] +// GFX12: encoding: [0x05,0x18,0x10,0xd7,0x01,0xc9,0x02,0x00] + +v_permlanex16_var_b32 v5, v1, v100, op_sel:[0,0] +// GFX12: encoding: [0x05,0x00,0x10,0xd7,0x01,0xc9,0x02,0x00] + +v_permlanex16_var_b32 v5, v1, v100, op_sel:[1,0] +// GFX12: encoding: [0x05,0x08,0x10,0xd7,0x01,0xc9,0x02,0x00] + +v_permlanex16_var_b32 v255, v255, v100, op_sel:[0,1] +// GFX12: encoding: [0xff,0x10,0x10,0xd7,0xff,0xc9,0x02,0x00] + v_qsad_pk_u16_u8 v[5:6], v[1:2], v2, ttmp[14:15] // GFX12: encoding: [0x05,0x00,0x3a,0xd6,0x01,0x05,0xea,0x01] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s index bd1ed4042fda9..d8236c302b6d0 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s +++ 
b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s @@ -6,3 +6,98 @@ v_permlane16_b32 v5, v1, s2, s3 op_sel:[0, 0, 0, 1] v_permlanex16_b32 v5, v1, s2, s3 op_sel:[0, 0, 1, 0] // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid op_sel operand + +v_permlane16_var_b32 v5, v1, v2 clamp +// GFX12: error: invalid operand for instruction +// GFX12-NEXT:{{^}}v_permlane16_var_b32 v5, v1, v2 clamp +// GFX12-NEXT:{{^}} ^ + +v_permlane16_var_b32 v5, v1, v2 div:2 +// GFX12: error: not a valid operand +// GFX12-NEXT:{{^}}v_permlane16_var_b32 v5, v1, v2 div:2 +// GFX12-NEXT:{{^}} ^ + +v_permlane16_var_b32 v5, v1, v2 mul:1 +// GFX12: error: not a valid operand +// GFX12-NEXT:{{^}}v_permlane16_var_b32 v5, v1, v2 mul:1 +// GFX12-NEXT:{{^}} ^ + +v_permlane16_var_b32 -v5, v1, v2 op_sel:[0, 1] +// GFX12: error: not a valid operand +// GFX12-NEXT:{{^}}v_permlane16_var_b32 -v5, v1, v2 op_sel:[0, 1] +// GFX12-NEXT:{{^}} ^ + +v_permlane16_var_b32 v5, -v1, v2 op_sel:[0, 1] +// GFX12: error: not a valid operand +// GFX12-NEXT:{{^}}v_permlane16_var_b32 v5, -v1, v2 op_sel:[0, 1] +// GFX12-NEXT:{{^}} ^ + +v_permlane16_var_b32 v5, v1, -v2 op_sel:[0, 1] +// GFX12: error: not a valid operand +// GFX12-NEXT:{{^}}v_permlane16_var_b32 v5, v1, -v2 op_sel:[0, 1] +// GFX12-NEXT:{{^}} ^ + +v_permlane16_var_b32 -|v5|, v1, v2 op_sel:[0, 1] +// GFX12: error: not a valid operand +// GFX12-NEXT:{{^}}v_permlane16_var_b32 -|v5|, v1, v2 op_sel:[0, 1] +// GFX12-NEXT:{{^}} ^ + +v_permlane16_var_b32 v5, -v1, |v2| op_sel:[0, 1] +// GFX12: error: not a valid operand +// GFX12-NEXT:{{^}}v_permlane16_var_b32 v5, -v1, |v2| op_sel:[0, 1] +// GFX12-NEXT:{{^}} ^ + +v_permlane16_var_b32 v5, v1, -|v2| op_sel:[0, 1] +// GFX12: error: not a valid operand +// GFX12-NEXT:{{^}}v_permlane16_var_b32 v5, v1, -|v2| op_sel:[0, 1] +// GFX12-NEXT:{{^}} ^ + +v_permlane16_var_b32 |v5|, v1, v2 op_sel:[0, 1] +// GFX12: error: not a valid operand +// GFX12-NEXT:{{^}}v_permlane16_var_b32 |v5|, v1, v2 op_sel:[0, 1] +// GFX12-NEXT:{{^}} ^ + 
+v_permlane16_var_b32 v5, |v1|, v2 op_sel:[0, 1] +// GFX12: error: not a valid operand +// GFX12-NEXT:{{^}}v_permlane16_var_b32 v5, |v1|, v2 op_sel:[0, 1] +// GFX12-NEXT:{{^}} ^ + +v_permlane16_var_b32 v5, v1, |v2| op_sel:[0, 1] +// GFX12: error: not a valid operand +// GFX12-NEXT:{{^}}v_permlane16_var_b32 v5, v1, |v2| op_sel:[0, 1] +// GFX12-NEXT:{{^}} ^ + +v_permlane16_var_b32 v5, v1, v2 op_sel:[-1, 0] +// GFX12: error: invalid op_sel value +// GFX12-NEXT:{{^}}v_permlane16_var_b32 v5, v1, v2 op_sel:[-1, 0] +// GFX12-NEXT:{{^}} ^ + +v_permlane16_var_b32 v5, v1, v2 op_sel:[1, -1] +// GFX12: error: invalid op_sel value +// GFX12-NEXT:{{^}}v_permlane16_var_b32 v5, v1, v2 op_sel:[1, -1] +// GFX12-NEXT:{{^}} ^ + +v_permlane16_var_b32 v5, v1, v2 op_sel:[0, 0, 0, 1] +// GFX12: error: invalid op_sel operand +// GFX12-NEXT:{{^}}v_permlane16_var_b32 v5, v1, v2 op_sel:[0, 0, 0, 1] +// GFX12-NEXT:{{^}} ^ + +v_permlane16_var_b32 v5, v1, v2 op_sel:[0, 0, 0, -1] +// GFX12: error: invalid op_sel value +// GFX12-NEXT:{{^}}v_permlane16_var_b32 v5, v1, v2 op_sel:[0, 0, 0, -1] +// GFX12-NEXT:{{^}} ^ + +v_permlane16_var_b32 v5, v1, v2 op_sel:[0, 0, 1, 0] +// GFX12: error: invalid op_sel operand +// GFX12-NEXT:{{^}}v_permlane16_var_b32 v5, v1, v2 op_sel:[0, 0, 1, 0] +// GFX12-NEXT:{{^}} ^ + +v_permlane16_var_b32 v5, v1, v2 op_sel:[0, 0, -1, 0] +// GFX12: error: invalid op_sel value +// GFX12-NEXT:{{^}}v_permlane16_var_b32 v5, v1, v2 op_sel:[0, 0, -1, 0] +// GFX12-NEXT:{{^}} ^ + +v_permlane16_var_b32 v5, v1, v2 op_sel:[0, 0, 1] +// GFX12: error: invalid op_sel operand +// GFX12-NEXT:{{^}}v_permlane16_var_b32 v5, v1, v2 op_sel:[0, 0, 1] +// GFX12-NEXT:{{^}} ^ diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt index 8ff8c2c4c4f6a..b277c276e6946 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt @@ -4774,6 +4774,57 @@ # GFX12: v_permlanex16_b32 v5, 
v1, s2, s3 op_sel:[0,1] ; encoding: [0x05,0x10,0x5c,0xd6,0x01,0x05,0x0c,0x00] 0x05,0x10,0x5c,0xd6,0x01,0x05,0x0c,0x00 +# GFX12: v_permlane16_var_b32 v5, v1, v2 ; encoding: [0x05,0x00,0x0f,0xd7,0x01,0x05,0x02,0x00] +0x05,0x00,0x0f,0xd7,0x01,0x05,0x02,0x00 + +# GFX12: v_permlane16_var_b32 v5, v1, v255 ; encoding: [0x05,0x00,0x0f,0xd7,0x01,0xff,0x03,0x00] +0x05,0x00,0x0f,0xd7,0x01,0xff,0x03,0x00 + +# GFX12: v_permlane16_var_b32 v5, v255, v0 ; encoding: [0x05,0x00,0x0f,0xd7,0xff,0x01,0x02,0x00] +0x05,0x00,0x0f,0xd7,0xff,0x01,0x02,0x00 + +# GFX12: v_permlane16_var_b32 v255, v1, v2 ; encoding: [0xff,0x00,0x0f,0xd7,0x01,0x05,0x02,0x00] +0xff,0x00,0x0f,0xd7,0x01,0x05,0x02,0x00 + +# GFX12: v_permlane16_var_b32 v5, v1, v50 op_sel:[1,1] ; encoding: [0x05,0x18,0x0f,0xd7,0x01,0x65,0x02,0x00] +0x05,0x18,0x0f,0xd7,0x01,0x65,0x02,0x00 + +# GFX12: v_permlane16_var_b32 v5, v1, v50 ; encoding: [0x05,0x00,0x0f,0xd7,0x01,0x65,0x02,0x00] +0x05,0x00,0x0f,0xd7,0x01,0x65,0x02,0x00 + +# GFX12: v_permlane16_var_b32 v5, v1, v50 op_sel:[1,0] ; encoding: [0x05,0x08,0x0f,0xd7,0x01,0x65,0x02,0x00] +0x05,0x08,0x0f,0xd7,0x01,0x65,0x02,0x00 + +# GFX12: v_permlane16_var_b32 v255, v255, v0 op_sel:[0,1] ; encoding: [0xff,0x10,0x0f,0xd7,0xff,0x01,0x02,0x00] +0xff,0x10,0x0f,0xd7,0xff,0x01,0x02,0x00 + +# GFX12: v_permlanex16_var_b32 v5, v1, v2 ; encoding: [0x05,0x00,0x10,0xd7,0x01,0x05,0x02,0x00] +0x05,0x00,0x10,0xd7,0x01,0x05,0x02,0x00 + +# GFX12: v_permlanex16_var_b32 v5, v1, v105 ; encoding: [0x05,0x00,0x10,0xd7,0x01,0xd3,0x02,0x00] +0x05,0x00,0x10,0xd7,0x01,0xd3,0x02,0x00 + +# GFX12: v_permlanex16_var_b32 v5, v1, v255 ; encoding: [0x05,0x00,0x10,0xd7,0x01,0xff,0x03,0x00] +0x05,0x00,0x10,0xd7,0x01,0xff,0x03,0x00 + +# GFX12: v_permlanex16_var_b32 v255, v1, v2 ; encoding: [0xff,0x00,0x10,0xd7,0x01,0x05,0x02,0x00] +0xff,0x00,0x10,0xd7,0x01,0x05,0x02,0x00 + +# GFX12: v_permlanex16_var_b32 v1, v255, v2 ; encoding: [0x01,0x00,0x10,0xd7,0xff,0x05,0x02,0x00] +0x01,0x00,0x10,0xd7,0xff,0x05,0x02,0x00 + +# GFX12: 
v_permlanex16_var_b32 v5, v1, v100 op_sel:[1,1] ; encoding: [0x05,0x18,0x10,0xd7,0x01,0xc9,0x02,0x00] +0x05,0x18,0x10,0xd7,0x01,0xc9,0x02,0x00 + +# GFX12: v_permlanex16_var_b32 v5, v1, v100 ; encoding: [0x05,0x00,0x10,0xd7,0x01,0xc9,0x02,0x00] +0x05,0x00,0x10,0xd7,0x01,0xc9,0x02,0x00 + +# GFX12: v_permlanex16_var_b32 v5, v1, v100 op_sel:[1,0] ; encoding: [0x05,0x08,0x10,0xd7,0x01,0xc9,0x02,0x00] +0x05,0x08,0x10,0xd7,0x01,0xc9,0x02,0x00 + +# GFX12: v_permlanex16_var_b32 v255, v255, v100 op_sel:[0,1] ; encoding: [0xff,0x10,0x10,0xd7,0xff,0xc9,0x02,0x00] +0xff,0x10,0x10,0xd7,0xff,0xc9,0x02,0x00 + # GFX12: v_pipeflush ; encoding: [0x00,0x00,0x9b,0xd5,0x00,0x00,0x00,0x00] 0x00,0x00,0x9b,0xd5,0x00,0x00,0x00,0x00