-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions #151810
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions #151810
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
dfe439a
to
dad0929
Compare
7bc4dd4
to
792b728
Compare
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-backend-amdgpu
Author: Stanislav Mekhanoshin (rampitec)
Changes
Patch is 34.66 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/151810.diff
10 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 9125315310306..ced758c814105 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -746,6 +746,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc",
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16, "V3UiV16hUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32, "V3UiV16fUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16, "V3UiV16hUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32, "V3UiV16fUif", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts")
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index e50ab77f48c79..4ff0571239e71 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -929,6 +929,42 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float
// CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 [[TMP41]], float [[TMP42]])
// CHECK-NEXT: [[TMP44:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4
+// CHECK-NEXT: [[TMP45:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP48:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 [[TMP46]], float [[TMP47]])
+// CHECK-NEXT: [[TMP49:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 16
+// CHECK-NEXT: [[TMP50:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP53:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], float [[TMP52]])
+// CHECK-NEXT: [[TMP54:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 16
+// CHECK-NEXT: [[TMP55:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP58:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 [[TMP56]], float [[TMP57]])
+// CHECK-NEXT: [[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 16
+// CHECK-NEXT: [[TMP60:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP61:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP62:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP63:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f16(<16 x half> [[TMP60]], i32 [[TMP61]], float [[TMP62]])
+// CHECK-NEXT: [[TMP64:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP63]], ptr addrspace(1) [[TMP64]], align 16
+// CHECK-NEXT: [[TMP65:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64
+// CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP67:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP68:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f32(<16 x float> [[TMP65]], i32 [[TMP66]], float [[TMP67]])
+// CHECK-NEXT: [[TMP69:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP68]], ptr addrspace(1) [[TMP69]], align 16
+// CHECK-NEXT: [[TMP70:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64
+// CHECK-NEXT: [[TMP71:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP72:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP73:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f32(<16 x float> [[TMP70]], i32 [[TMP71]], float [[TMP72]])
+// CHECK-NEXT: [[TMP74:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP73]], ptr addrspace(1) [[TMP74]], align 16
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_sr_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float8 srcf8,
@@ -944,6 +980,12 @@ void test_cvt_scalef32_sr_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, fl
*out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32(srcf8, sr, scale);
*out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16(srch8, sr, scale);
*out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16(srcbf8, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16(srcbf16, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16(srch16, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16(srcbf16, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16(srch16, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32(srcf16, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32(srcf16, sr, scale);
}
// CHECK-LABEL: @test_sat_pk4_i4_i8(
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index af06fe7a09d7e..469bdb409aaff 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -701,6 +701,12 @@ def int_amdgcn_cvt_scalef32_sr_pk8_bf8_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm
def int_amdgcn_cvt_scalef32_sr_pk8_fp4_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_i32_ty, llvm_v8f32_ty, "cvt_scalef32_sr_pk8_fp4_f32">;
def int_amdgcn_cvt_scalef32_sr_pk8_fp4_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_i32_ty, llvm_v8f16_ty, "cvt_scalef32_sr_pk8_fp4_f16">;
def int_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_i32_ty, llvm_v8bf16_ty, "cvt_scalef32_sr_pk8_fp4_bf16">;
+def int_amdgcn_cvt_scalef32_sr_pk16_fp6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16f32_ty, "cvt_scalef32_sr_pk16_fp6_f32">;
+def int_amdgcn_cvt_scalef32_sr_pk16_bf6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16f32_ty, "cvt_scalef32_sr_pk16_bf6_f32">;
+def int_amdgcn_cvt_scalef32_sr_pk16_fp6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16f16_ty, "cvt_scalef32_sr_pk16_fp6_f16">;
+def int_amdgcn_cvt_scalef32_sr_pk16_bf6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16f16_ty, "cvt_scalef32_sr_pk16_bf6_f16">;
+def int_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16bf16_ty, "cvt_scalef32_sr_pk16_fp6_bf16">;
+def int_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16bf16_ty, "cvt_scalef32_sr_pk16_bf6_bf16">;
def int_amdgcn_cvt_scalef32_2xpk16_fp6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_fp6_f32">;
def int_amdgcn_cvt_scalef32_2xpk16_bf6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_bf6_f32">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index ddb1e1081da8a..d11e5a3c4e3cf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4633,6 +4633,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f32:
case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f16:
case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_bf16:
case Intrinsic::amdgcn_sat_pk4_i4_i8:
case Intrinsic::amdgcn_sat_pk4_u4_u8:
case Intrinsic::amdgcn_fmed3:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 350a31885e629..4698a5805ee0c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2973,6 +2973,9 @@ def VOP_I32_F32_I32_F32 : VOPProfile<[i32, f32, i32, f32]>;
def VOP_V6I32_V32BF16_I32_F32 : VOPProfile<[v6i32, v32bf16, i32, f32]>;
def VOP_V6I32_V32F16_I32_F32 : VOPProfile<[v6i32, v32f16, i32, f32]>;
def VOP_V6I32_V32F32_I32_F32 : VOPProfile<[v6i32, v32f32, i32, f32]>;
+def VOP_V3I32_V16F16_I32_F32 : VOPProfile<[v3i32, v16f16, i32, f32]>;
+def VOP_V3I32_V16BF16_I32_F32 : VOPProfile<[v3i32, v16bf16, i32, f32]>;
+def VOP_V3I32_V16F32_I32_F32 : VOPProfile<[v3i32, v16f32, i32, f32]>;
def VOP_V2I32_V8BF16_I32_F32 : VOPProfile<[v2i32, v8bf16, i32, f32]>;
def VOP_V2I32_V8F16_I32_F32 : VOPProfile<[v2i32, v8f16, i32, f32]>;
def VOP_V2I32_V8F32_I32_F32 : VOPProfile<[v2i32, v8f32, i32, f32]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 100bb2de9abc7..f4b6af647ca1a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1820,6 +1820,12 @@ let SubtargetPredicate = isGFX1250Plus in {
defm V_CVT_SCALEF32_SR_PK8_FP4_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_f16>;
defm V_CVT_SCALEF32_SR_PK8_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16>;
} // End WaveSizePredicate = isWave32
+ defm V_CVT_SCALEF32_SR_PK16_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16>;
+ defm V_CVT_SCALEF32_SR_PK16_BF6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_f16>;
+ defm V_CVT_SCALEF32_SR_PK16_BF6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_f32>;
+ defm V_CVT_SCALEF32_SR_PK16_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16>;
+ defm V_CVT_SCALEF32_SR_PK16_FP6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_f16>;
+ defm V_CVT_SCALEF32_SR_PK16_FP6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_f32>;
} // End Constraints = "@earlyclobber $vdst"
let True16Predicate = UseRealTrue16Insts in {
@@ -2272,6 +2278,12 @@ defm V_CVT_SCALEF32_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2cf>;
defm V_CVT_SCALEF32_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d0>;
defm V_CVT_SCALEF32_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d1>;
defm V_CVT_SCALEF32_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d2>;
+defm V_CVT_SCALEF32_SR_PK16_FP6_F32 : VOP3Only_Real_Base_gfx1250<0x2d3>;
+defm V_CVT_SCALEF32_SR_PK16_BF6_F32 : VOP3Only_Real_Base_gfx1250<0x2d4>;
+defm V_CVT_SCALEF32_SR_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2d5>;
+defm V_CVT_SCALEF32_SR_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d6>;
+defm V_CVT_SCALEF32_SR_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d7>;
+defm V_CVT_SCALEF32_SR_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d8>;
defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x297>;
defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x298>;
defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x299>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll
new file mode 100644
index 0000000000000..c4395182d6719
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll
@@ -0,0 +1,232 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> %src, i32 %sr, float %scale)
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> %src, i32 %sr, float %scale)
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f32(<16 x float> %src, i32 %sr, float %scale)
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> %src, i32 %sr, float %scale)
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f16(<16 x half> %src, i32 %sr, float %scale)
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f32(<16 x float> %src, i32 %sr, float %scale)
+
+define amdgpu_ps void @test_scalef32_sr_pk16_bf6_bf16_vv(<16 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_bf16_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_bf6_bf16 v[12:14], v[0:7], v8, v9
+; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> %src, i32 %sr, float %scale)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_bf6_bf16_sl(<16 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_bf16_sl:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
+; GFX1250-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_bf6_bf16 v[10:12], v[2:9], s8, 0x42c80000
+; GFX1250-NEXT: global_store_b96 v[0:1], v[10:12], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> %src, i32 %sr, float 100.0)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f16_vv(<16 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_f16_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_bf6_f16 v[12:14], v[0:7], v8, v9
+; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> %src, i32 %sr, float %scale)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f16_sl(<16 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_sr_pk16_bf6_f16_sl:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk16_bf6_f16 v[10:12], v[2:9], s8, 0x42c80000
+; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[10:12], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_sr_pk16_bf6_f16_sl:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk16_bf6_f16 v[10:12], v[2:9], s8, 0x42c80000
+; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[10:12], off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> %src, i32 %sr, float 100.0)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_fp6_bf16_vv(<16 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_bf16_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_fp6_bf16 v[12:14], v[0:7], v8, v9
+; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> %src, i32 %sr, float %scale)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_fp6_bf16_sl(<16 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_bf16_sl:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
+; GFX1250-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_fp6_bf16 v[10:12], v[2:9], s8, 0x42c80000
+; GFX1250-NEXT: global_store_b96 v[0:1], v[10:12], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> %src, i32 %sr, float 100.0)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f16_vv(<16 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_f16_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_fp6_f16 v[12:14], v[0:7], v8, v9
+; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f16(<16 x half> %src, i32 %sr, float %scale)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f16_sl(<16 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_sr_pk16_fp6_f16_sl:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2...
[truncated]
|
@llvm/pr-subscribers-clang
Author: Stanislav Mekhanoshin (rampitec)
Changes
Patch is 34.66 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/151810.diff
10 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 9125315310306..ced758c814105 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -746,6 +746,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc",
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16, "V3UiV16hUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32, "V3UiV16fUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16, "V3UiV16hUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32, "V3UiV16fUif", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts")
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index e50ab77f48c79..4ff0571239e71 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -929,6 +929,42 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float
// CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 [[TMP41]], float [[TMP42]])
// CHECK-NEXT: [[TMP44:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4
+// CHECK-NEXT: [[TMP45:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP48:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 [[TMP46]], float [[TMP47]])
+// CHECK-NEXT: [[TMP49:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 16
+// CHECK-NEXT: [[TMP50:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP53:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], float [[TMP52]])
+// CHECK-NEXT: [[TMP54:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 16
+// CHECK-NEXT: [[TMP55:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP58:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 [[TMP56]], float [[TMP57]])
+// CHECK-NEXT: [[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 16
+// CHECK-NEXT: [[TMP60:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP61:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP62:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP63:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f16(<16 x half> [[TMP60]], i32 [[TMP61]], float [[TMP62]])
+// CHECK-NEXT: [[TMP64:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP63]], ptr addrspace(1) [[TMP64]], align 16
+// CHECK-NEXT: [[TMP65:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64
+// CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP67:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP68:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f32(<16 x float> [[TMP65]], i32 [[TMP66]], float [[TMP67]])
+// CHECK-NEXT: [[TMP69:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP68]], ptr addrspace(1) [[TMP69]], align 16
+// CHECK-NEXT: [[TMP70:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64
+// CHECK-NEXT: [[TMP71:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP72:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP73:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f32(<16 x float> [[TMP70]], i32 [[TMP71]], float [[TMP72]])
+// CHECK-NEXT: [[TMP74:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP73]], ptr addrspace(1) [[TMP74]], align 16
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_sr_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float8 srcf8,
@@ -944,6 +980,12 @@ void test_cvt_scalef32_sr_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, fl
*out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32(srcf8, sr, scale);
*out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16(srch8, sr, scale);
*out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16(srcbf8, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16(srcbf16, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16(srch16, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16(srcbf16, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16(srch16, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32(srcf16, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32(srcf16, sr, scale);
}
// CHECK-LABEL: @test_sat_pk4_i4_i8(
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index af06fe7a09d7e..469bdb409aaff 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -701,6 +701,12 @@ def int_amdgcn_cvt_scalef32_sr_pk8_bf8_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm
def int_amdgcn_cvt_scalef32_sr_pk8_fp4_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_i32_ty, llvm_v8f32_ty, "cvt_scalef32_sr_pk8_fp4_f32">;
def int_amdgcn_cvt_scalef32_sr_pk8_fp4_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_i32_ty, llvm_v8f16_ty, "cvt_scalef32_sr_pk8_fp4_f16">;
def int_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_i32_ty, llvm_v8bf16_ty, "cvt_scalef32_sr_pk8_fp4_bf16">;
+def int_amdgcn_cvt_scalef32_sr_pk16_fp6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16f32_ty, "cvt_scalef32_sr_pk16_fp6_f32">;
+def int_amdgcn_cvt_scalef32_sr_pk16_bf6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16f32_ty, "cvt_scalef32_sr_pk16_bf6_f32">;
+def int_amdgcn_cvt_scalef32_sr_pk16_fp6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16f16_ty, "cvt_scalef32_sr_pk16_fp6_f16">;
+def int_amdgcn_cvt_scalef32_sr_pk16_bf6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16f16_ty, "cvt_scalef32_sr_pk16_bf6_f16">;
+def int_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16bf16_ty, "cvt_scalef32_sr_pk16_fp6_bf16">;
+def int_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16bf16_ty, "cvt_scalef32_sr_pk16_bf6_bf16">;
def int_amdgcn_cvt_scalef32_2xpk16_fp6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_fp6_f32">;
def int_amdgcn_cvt_scalef32_2xpk16_bf6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_bf6_f32">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index ddb1e1081da8a..d11e5a3c4e3cf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4633,6 +4633,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f32:
case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f16:
case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_bf16:
case Intrinsic::amdgcn_sat_pk4_i4_i8:
case Intrinsic::amdgcn_sat_pk4_u4_u8:
case Intrinsic::amdgcn_fmed3:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 350a31885e629..4698a5805ee0c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2973,6 +2973,9 @@ def VOP_I32_F32_I32_F32 : VOPProfile<[i32, f32, i32, f32]>;
def VOP_V6I32_V32BF16_I32_F32 : VOPProfile<[v6i32, v32bf16, i32, f32]>;
def VOP_V6I32_V32F16_I32_F32 : VOPProfile<[v6i32, v32f16, i32, f32]>;
def VOP_V6I32_V32F32_I32_F32 : VOPProfile<[v6i32, v32f32, i32, f32]>;
+def VOP_V3I32_V16F16_I32_F32 : VOPProfile<[v3i32, v16f16, i32, f32]>;
+def VOP_V3I32_V16BF16_I32_F32 : VOPProfile<[v3i32, v16bf16, i32, f32]>;
+def VOP_V3I32_V16F32_I32_F32 : VOPProfile<[v3i32, v16f32, i32, f32]>;
def VOP_V2I32_V8BF16_I32_F32 : VOPProfile<[v2i32, v8bf16, i32, f32]>;
def VOP_V2I32_V8F16_I32_F32 : VOPProfile<[v2i32, v8f16, i32, f32]>;
def VOP_V2I32_V8F32_I32_F32 : VOPProfile<[v2i32, v8f32, i32, f32]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 100bb2de9abc7..f4b6af647ca1a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1820,6 +1820,12 @@ let SubtargetPredicate = isGFX1250Plus in {
defm V_CVT_SCALEF32_SR_PK8_FP4_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_f16>;
defm V_CVT_SCALEF32_SR_PK8_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16>;
} // End WaveSizePredicate = isWave32
+ defm V_CVT_SCALEF32_SR_PK16_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16>;
+ defm V_CVT_SCALEF32_SR_PK16_BF6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_f16>;
+ defm V_CVT_SCALEF32_SR_PK16_BF6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_f32>;
+ defm V_CVT_SCALEF32_SR_PK16_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16>;
+ defm V_CVT_SCALEF32_SR_PK16_FP6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_f16>;
+ defm V_CVT_SCALEF32_SR_PK16_FP6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_f32>;
} // End Constraints = "@earlyclobber $vdst"
let True16Predicate = UseRealTrue16Insts in {
@@ -2272,6 +2278,12 @@ defm V_CVT_SCALEF32_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2cf>;
defm V_CVT_SCALEF32_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d0>;
defm V_CVT_SCALEF32_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d1>;
defm V_CVT_SCALEF32_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d2>;
+defm V_CVT_SCALEF32_SR_PK16_FP6_F32 : VOP3Only_Real_Base_gfx1250<0x2d3>;
+defm V_CVT_SCALEF32_SR_PK16_BF6_F32 : VOP3Only_Real_Base_gfx1250<0x2d4>;
+defm V_CVT_SCALEF32_SR_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2d5>;
+defm V_CVT_SCALEF32_SR_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d6>;
+defm V_CVT_SCALEF32_SR_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d7>;
+defm V_CVT_SCALEF32_SR_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d8>;
defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x297>;
defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x298>;
defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x299>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll
new file mode 100644
index 0000000000000..c4395182d6719
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll
@@ -0,0 +1,232 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> %src, i32 %sr, float %scale)
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> %src, i32 %sr, float %scale)
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f32(<16 x float> %src, i32 %sr, float %scale)
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> %src, i32 %sr, float %scale)
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f16(<16 x half> %src, i32 %sr, float %scale)
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f32(<16 x float> %src, i32 %sr, float %scale)
+
+define amdgpu_ps void @test_scalef32_sr_pk16_bf6_bf16_vv(<16 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_bf16_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_bf6_bf16 v[12:14], v[0:7], v8, v9
+; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> %src, i32 %sr, float %scale)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_bf6_bf16_sl(<16 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_bf16_sl:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
+; GFX1250-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_bf6_bf16 v[10:12], v[2:9], s8, 0x42c80000
+; GFX1250-NEXT: global_store_b96 v[0:1], v[10:12], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> %src, i32 %sr, float 100.0)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f16_vv(<16 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_f16_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_bf6_f16 v[12:14], v[0:7], v8, v9
+; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> %src, i32 %sr, float %scale)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f16_sl(<16 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_sr_pk16_bf6_f16_sl:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk16_bf6_f16 v[10:12], v[2:9], s8, 0x42c80000
+; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[10:12], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_sr_pk16_bf6_f16_sl:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk16_bf6_f16 v[10:12], v[2:9], s8, 0x42c80000
+; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[10:12], off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> %src, i32 %sr, float 100.0)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_fp6_bf16_vv(<16 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_bf16_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_fp6_bf16 v[12:14], v[0:7], v8, v9
+; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> %src, i32 %sr, float %scale)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_fp6_bf16_sl(<16 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_bf16_sl:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
+; GFX1250-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_fp6_bf16 v[10:12], v[2:9], s8, 0x42c80000
+; GFX1250-NEXT: global_store_b96 v[0:1], v[10:12], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> %src, i32 %sr, float 100.0)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f16_vv(<16 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_f16_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_fp6_f16 v[12:14], v[0:7], v8, v9
+; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f16(<16 x half> %src, i32 %sr, float %scale)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f16_sl(<16 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_sr_pk16_fp6_f16_sl:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2...
[truncated]
|
@llvm/pr-subscribers-mc
Author: Stanislav Mekhanoshin (rampitec)
Changes
Patch is 34.66 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151810.diff
10 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 9125315310306..ced758c814105 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -746,6 +746,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc",
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16, "V3UiV16hUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32, "V3UiV16fUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16, "V3UiV16hUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32, "V3UiV16fUif", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts")
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index e50ab77f48c79..4ff0571239e71 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -929,6 +929,42 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float
// CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 [[TMP41]], float [[TMP42]])
// CHECK-NEXT: [[TMP44:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4
+// CHECK-NEXT: [[TMP45:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP48:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 [[TMP46]], float [[TMP47]])
+// CHECK-NEXT: [[TMP49:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 16
+// CHECK-NEXT: [[TMP50:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP53:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], float [[TMP52]])
+// CHECK-NEXT: [[TMP54:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 16
+// CHECK-NEXT: [[TMP55:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP58:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 [[TMP56]], float [[TMP57]])
+// CHECK-NEXT: [[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 16
+// CHECK-NEXT: [[TMP60:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP61:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP62:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP63:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f16(<16 x half> [[TMP60]], i32 [[TMP61]], float [[TMP62]])
+// CHECK-NEXT: [[TMP64:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP63]], ptr addrspace(1) [[TMP64]], align 16
+// CHECK-NEXT: [[TMP65:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64
+// CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP67:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP68:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f32(<16 x float> [[TMP65]], i32 [[TMP66]], float [[TMP67]])
+// CHECK-NEXT: [[TMP69:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP68]], ptr addrspace(1) [[TMP69]], align 16
+// CHECK-NEXT: [[TMP70:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64
+// CHECK-NEXT: [[TMP71:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP72:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP73:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f32(<16 x float> [[TMP70]], i32 [[TMP71]], float [[TMP72]])
+// CHECK-NEXT: [[TMP74:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP73]], ptr addrspace(1) [[TMP74]], align 16
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_sr_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float8 srcf8,
@@ -944,6 +980,12 @@ void test_cvt_scalef32_sr_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, fl
*out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32(srcf8, sr, scale);
*out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16(srch8, sr, scale);
*out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16(srcbf8, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16(srcbf16, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16(srch16, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16(srcbf16, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16(srch16, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32(srcf16, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32(srcf16, sr, scale);
}
// CHECK-LABEL: @test_sat_pk4_i4_i8(
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index af06fe7a09d7e..469bdb409aaff 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -701,6 +701,12 @@ def int_amdgcn_cvt_scalef32_sr_pk8_bf8_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm
def int_amdgcn_cvt_scalef32_sr_pk8_fp4_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_i32_ty, llvm_v8f32_ty, "cvt_scalef32_sr_pk8_fp4_f32">;
def int_amdgcn_cvt_scalef32_sr_pk8_fp4_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_i32_ty, llvm_v8f16_ty, "cvt_scalef32_sr_pk8_fp4_f16">;
def int_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_i32_ty, llvm_v8bf16_ty, "cvt_scalef32_sr_pk8_fp4_bf16">;
+def int_amdgcn_cvt_scalef32_sr_pk16_fp6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16f32_ty, "cvt_scalef32_sr_pk16_fp6_f32">;
+def int_amdgcn_cvt_scalef32_sr_pk16_bf6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16f32_ty, "cvt_scalef32_sr_pk16_bf6_f32">;
+def int_amdgcn_cvt_scalef32_sr_pk16_fp6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16f16_ty, "cvt_scalef32_sr_pk16_fp6_f16">;
+def int_amdgcn_cvt_scalef32_sr_pk16_bf6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16f16_ty, "cvt_scalef32_sr_pk16_bf6_f16">;
+def int_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16bf16_ty, "cvt_scalef32_sr_pk16_fp6_bf16">;
+def int_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16bf16_ty, "cvt_scalef32_sr_pk16_bf6_bf16">;
def int_amdgcn_cvt_scalef32_2xpk16_fp6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_fp6_f32">;
def int_amdgcn_cvt_scalef32_2xpk16_bf6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_bf6_f32">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index ddb1e1081da8a..d11e5a3c4e3cf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4633,6 +4633,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f32:
case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f16:
case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_bf16:
case Intrinsic::amdgcn_sat_pk4_i4_i8:
case Intrinsic::amdgcn_sat_pk4_u4_u8:
case Intrinsic::amdgcn_fmed3:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 350a31885e629..4698a5805ee0c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2973,6 +2973,9 @@ def VOP_I32_F32_I32_F32 : VOPProfile<[i32, f32, i32, f32]>;
def VOP_V6I32_V32BF16_I32_F32 : VOPProfile<[v6i32, v32bf16, i32, f32]>;
def VOP_V6I32_V32F16_I32_F32 : VOPProfile<[v6i32, v32f16, i32, f32]>;
def VOP_V6I32_V32F32_I32_F32 : VOPProfile<[v6i32, v32f32, i32, f32]>;
+def VOP_V3I32_V16F16_I32_F32 : VOPProfile<[v3i32, v16f16, i32, f32]>;
+def VOP_V3I32_V16BF16_I32_F32 : VOPProfile<[v3i32, v16bf16, i32, f32]>;
+def VOP_V3I32_V16F32_I32_F32 : VOPProfile<[v3i32, v16f32, i32, f32]>;
def VOP_V2I32_V8BF16_I32_F32 : VOPProfile<[v2i32, v8bf16, i32, f32]>;
def VOP_V2I32_V8F16_I32_F32 : VOPProfile<[v2i32, v8f16, i32, f32]>;
def VOP_V2I32_V8F32_I32_F32 : VOPProfile<[v2i32, v8f32, i32, f32]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 100bb2de9abc7..f4b6af647ca1a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1820,6 +1820,12 @@ let SubtargetPredicate = isGFX1250Plus in {
defm V_CVT_SCALEF32_SR_PK8_FP4_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_f16>;
defm V_CVT_SCALEF32_SR_PK8_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16>;
} // End WaveSizePredicate = isWave32
+ defm V_CVT_SCALEF32_SR_PK16_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16>;
+ defm V_CVT_SCALEF32_SR_PK16_BF6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_f16>;
+ defm V_CVT_SCALEF32_SR_PK16_BF6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_f32>;
+ defm V_CVT_SCALEF32_SR_PK16_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16>;
+ defm V_CVT_SCALEF32_SR_PK16_FP6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_f16>;
+ defm V_CVT_SCALEF32_SR_PK16_FP6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_f32>;
} // End Constraints = "@earlyclobber $vdst"
let True16Predicate = UseRealTrue16Insts in {
@@ -2272,6 +2278,12 @@ defm V_CVT_SCALEF32_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2cf>;
defm V_CVT_SCALEF32_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d0>;
defm V_CVT_SCALEF32_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d1>;
defm V_CVT_SCALEF32_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d2>;
+defm V_CVT_SCALEF32_SR_PK16_FP6_F32 : VOP3Only_Real_Base_gfx1250<0x2d3>;
+defm V_CVT_SCALEF32_SR_PK16_BF6_F32 : VOP3Only_Real_Base_gfx1250<0x2d4>;
+defm V_CVT_SCALEF32_SR_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2d5>;
+defm V_CVT_SCALEF32_SR_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d6>;
+defm V_CVT_SCALEF32_SR_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d7>;
+defm V_CVT_SCALEF32_SR_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d8>;
defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x297>;
defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x298>;
defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x299>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll
new file mode 100644
index 0000000000000..c4395182d6719
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll
@@ -0,0 +1,232 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> %src, i32 %sr, float %scale)
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> %src, i32 %sr, float %scale)
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f32(<16 x float> %src, i32 %sr, float %scale)
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> %src, i32 %sr, float %scale)
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f16(<16 x half> %src, i32 %sr, float %scale)
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f32(<16 x float> %src, i32 %sr, float %scale)
+
+define amdgpu_ps void @test_scalef32_sr_pk16_bf6_bf16_vv(<16 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_bf16_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_bf6_bf16 v[12:14], v[0:7], v8, v9
+; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> %src, i32 %sr, float %scale)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_bf6_bf16_sl(<16 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_bf16_sl:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
+; GFX1250-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_bf6_bf16 v[10:12], v[2:9], s8, 0x42c80000
+; GFX1250-NEXT: global_store_b96 v[0:1], v[10:12], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> %src, i32 %sr, float 100.0)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f16_vv(<16 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_f16_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_bf6_f16 v[12:14], v[0:7], v8, v9
+; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> %src, i32 %sr, float %scale)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f16_sl(<16 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_sr_pk16_bf6_f16_sl:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk16_bf6_f16 v[10:12], v[2:9], s8, 0x42c80000
+; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[10:12], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_sr_pk16_bf6_f16_sl:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk16_bf6_f16 v[10:12], v[2:9], s8, 0x42c80000
+; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[10:12], off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> %src, i32 %sr, float 100.0)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_fp6_bf16_vv(<16 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_bf16_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_fp6_bf16 v[12:14], v[0:7], v8, v9
+; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> %src, i32 %sr, float %scale)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_fp6_bf16_sl(<16 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_bf16_sl:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
+; GFX1250-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_fp6_bf16 v[10:12], v[2:9], s8, 0x42c80000
+; GFX1250-NEXT: global_store_b96 v[0:1], v[10:12], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> %src, i32 %sr, float 100.0)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f16_vv(<16 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_f16_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_fp6_f16 v[12:14], v[0:7], v8, v9
+; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f16(<16 x half> %src, i32 %sr, float %scale)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f16_sl(<16 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_sr_pk16_fp6_f16_sl:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2...
[truncated]
|
dad0929 to ff0a7f8 Compare
792b728 to 41253bc Compare
ff0a7f8 to e5c7f72 Compare
…sr_pk16__gfx1250_instructions
…sr_pk16__gfx1250_instructions
…sr_pk16__gfx1250_instructions
…sr_pk16__gfx1250_instructions
…sr_pk16__gfx1250_instructions
No description provided.