Skip to content
Permalink
Browse files
[AMDGPU] gfx90a support
Differential Revision: https://reviews.llvm.org/D96906
  • Loading branch information
rampitec committed Feb 18, 2021
1 parent 0252e6e commit a8d9d50762c42d726274d3f1126ec97ff96e2a22
Showing 359 changed files with 76,291 additions and 7,268 deletions.
@@ -2982,6 +2982,10 @@ Specify CU (-mcumode) or WGP (-mno-cumode) wavefront execution mode (AMDGPU only

Specify SRAM ECC mode (AMDGPU only)

.. option:: -mtgsplit, -mno-tgsplit

Enable threadgroup split execution mode (AMDGPU only)

.. option:: -mxnack, -mno-xnack

Specify XNACK mode (AMDGPU only)
@@ -258,5 +258,13 @@ TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_4x4x2bf16, "V4fV2sV2sV4fIiIiIi", "nc",
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x4bf16, "V16fV2sV2sV16fIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x8bf16, "V4fV2sV2sV4fIiIiIi", "nc", "mai-insts")

// MFMA builtins added together with gfx90a support: five bf16 "_1k" variants
// whose A/B operands are 4-element short vectors (V4s), followed by two f64
// variants. As with the entries above, the three trailing "Ii" parameters are
// integer arguments that must be constant expressions.
// NOTE(review): all of these are gated on "mai-insts", the same feature string
// used by the older MFMA builtins above, so the frontend does not distinguish
// them by target -- confirm whether a gfx90a-specific feature is intended.
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x4bf16_1k, "V32fV4sV4sV32fIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x4bf16_1k, "V16fV4sV4sV16fIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_4x4x4bf16_1k, "V4fV4sV4sV4fIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x8bf16_1k, "V16fV4sV4sV16fIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x16bf16_1k, "V4fV4sV4sV4fIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f64_16x16x4f64, "V4dddV4dIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f64_4x4x4f64, "ddddIiIiIi", "nc", "mai-insts")

#undef BUILTIN
#undef TARGET_BUILTIN
@@ -78,6 +78,7 @@ enum class CudaArch {
GFX906,
GFX908,
GFX909,
GFX90a,
GFX90c,
GFX1010,
GFX1011,
@@ -3094,6 +3094,11 @@ def mcumode : Flag<["-"], "mcumode">, Group<m_amdgpu_Features_Group>,
HelpText<"Specify CU (-mcumode) or WGP (-mno-cumode) wavefront execution mode (AMDGPU only)">;
def mno_cumode : Flag<["-"], "mno-cumode">, Group<m_amdgpu_Features_Group>;

// -mtgsplit / -mno-tgsplit: a positive/negative pair of driver flags in the
// AMDGPU features group that enable or disable threadgroup split execution
// mode. Both carry their own help text.
def mtgsplit : Flag<["-"], "mtgsplit">, Group<m_amdgpu_Features_Group>,
HelpText<"Enable threadgroup split execution mode (AMDGPU only)">;
def mno_tgsplit : Flag<["-"], "mno-tgsplit">, Group<m_amdgpu_Features_Group>,
HelpText<"Disable threadgroup split execution mode (AMDGPU only)">;

def mwavefrontsize64 : Flag<["-"], "mwavefrontsize64">, Group<m_Group>,
HelpText<"Specify wavefront size 64 mode (AMDGPU only)">;
def mno_wavefrontsize64 : Flag<["-"], "mno-wavefrontsize64">, Group<m_Group>,
@@ -98,6 +98,7 @@ CudaArchToStringMap arch_names[] = {
GFX(906), // gfx906
GFX(908), // gfx908
GFX(909), // gfx909
GFX(90a), // gfx90a
GFX(90c), // gfx90c
GFX(1010), // gfx1010
GFX(1011), // gfx1011
@@ -212,6 +212,9 @@ bool AMDGPUTargetInfo::initFeatureMap(
Features["s-memrealtime"] = true;
Features["s-memtime-inst"] = true;
break;
case GK_GFX90A:
Features["gfx90a-insts"] = true;
LLVM_FALLTHROUGH;
case GK_GFX908:
Features["dot3-insts"] = true;
Features["dot4-insts"] = true;
@@ -202,6 +202,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
case CudaArch::GFX906:
case CudaArch::GFX908:
case CudaArch::GFX909:
case CudaArch::GFX90a:
case CudaArch::GFX90c:
case CudaArch::GFX1010:
case CudaArch::GFX1011:
@@ -4634,6 +4634,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
case CudaArch::GFX906:
case CudaArch::GFX908:
case CudaArch::GFX909:
case CudaArch::GFX90a:
case CudaArch::GFX90c:
case CudaArch::GFX1010:
case CudaArch::GFX1011:
@@ -4703,6 +4704,7 @@ static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
case CudaArch::GFX906:
case CudaArch::GFX908:
case CudaArch::GFX909:
case CudaArch::GFX90a:
case CudaArch::GFX90c:
case CudaArch::GFX1010:
case CudaArch::GFX1011:
@@ -23,6 +23,7 @@
// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx906 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX906 %s
// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx908 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX908 %s
// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx909 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX909 %s
// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx90a -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX90A %s
// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx90c -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX90C %s
// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1010 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1010 %s
// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1011 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1011 %s
@@ -52,6 +53,7 @@
// GFX906: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
// GFX908: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime,+s-memtime-inst"
// GFX909: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
// GFX90A: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst"
// GFX90C: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
@@ -1,5 +1,6 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -S -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX908
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A

#pragma OPENCL EXTENSION cl_khr_fp64:enable

@@ -19,143 +20,199 @@ typedef short v32s __attribute__((ext_vector_type(32)));
typedef double v4d __attribute__((ext_vector_type(4)));


// CHECK-LABEL: @test_mfma_f32_32x32x1f32
// CHECK: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)
#ifdef MFMA_GFX908_TESTS

// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x1f32
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f32_32x32x1f32 on float a/b and v32f c with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f32_32x32x1f32(global v32f* out, float a, float b, v32f c)
{
*out = __builtin_amdgcn_mfma_f32_32x32x1f32(a, b, c, 0, 0, 0);
}

// CHECK-LABEL: @test_mfma_f32_16x16x1f32
// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x1f32
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f32_16x16x1f32 on float a/b and v16f c with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f32_16x16x1f32(global v16f* out, float a, float b, v16f c)
{
*out = __builtin_amdgcn_mfma_f32_16x16x1f32(a, b, c, 0, 0, 0);
}

// CHECK-LABEL: @test_mfma_f32_4x4x1f32
// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x1f32
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f32_4x4x1f32 on float a/b and v4f c with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f32_4x4x1f32(global v4f* out, float a, float b, v4f c)
{
*out = __builtin_amdgcn_mfma_f32_4x4x1f32(a, b, c, 0, 0, 0);
}

// CHECK-LABEL: @test_mfma_f32_32x32x2f32
// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2f32
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f32_32x32x2f32 on float a/b and v16f c with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f32_32x32x2f32(global v16f* out, float a, float b, v16f c)
{
*out = __builtin_amdgcn_mfma_f32_32x32x2f32(a, b, c, 0, 0, 0);
}

// CHECK-LABEL: @test_mfma_f32_16x16x4f32
// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f32
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f32_16x16x4f32 on float a/b and v4f c with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f32_16x16x4f32(global v4f* out, float a, float b, v4f c)
{
*out = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, c, 0, 0, 0);
}

// CHECK-LABEL: @test_mfma_f32_32x32x4f16
// CHECK: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %a, <4 x half> %b, <32 x float> %c, i32 0, i32 0, i32 0)
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4f16
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %a, <4 x half> %b, <32 x float> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f32_32x32x4f16 on v4h a/b and v32f c with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f32_32x32x4f16(global v32f* out, v4h a, v4h b, v32f c)
{
*out = __builtin_amdgcn_mfma_f32_32x32x4f16(a, b, c, 0, 0, 0);
}

// CHECK-LABEL: @test_mfma_f32_16x16x4f16
// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f32_16x16x4f16 on v4h a/b and v16f c with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f32_16x16x4f16(global v16f* out, v4h a, v4h b, v16f c)
{
*out = __builtin_amdgcn_mfma_f32_16x16x4f16(a, b, c, 0, 0, 0);
}

// CHECK-LABEL: @test_mfma_f32_4x4x4f16
// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x4f16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f32_4x4x4f16 on v4h a/b and v4f c with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f32_4x4x4f16(global v4f* out, v4h a, v4h b, v4f c)
{
*out = __builtin_amdgcn_mfma_f32_4x4x4f16(a, b, c, 0, 0, 0);
}

// CHECK-LABEL: @test_mfma_f32_32x32x8f16
// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x8f16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f32_32x32x8f16 on v4h a/b and v16f c with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f32_32x32x8f16(global v16f* out, v4h a, v4h b, v16f c)
{
*out = __builtin_amdgcn_mfma_f32_32x32x8f16(a, b, c, 0, 0, 0);
}

// CHECK-LABEL: @test_mfma_f32_16x16x16f16
// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x16f16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f32_16x16x16f16 on v4h a/b and v4f c with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f32_16x16x16f16(global v4f* out, v4h a, v4h b, v4f c)
{
*out = __builtin_amdgcn_mfma_f32_16x16x16f16(a, b, c, 0, 0, 0);
}

// CHECK-LABEL: @test_mfma_i32_32x32x4i8
// CHECK: call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 %a, i32 %b, <32 x i32> %c, i32 0, i32 0, i32 0)
// CHECK-GFX908-LABEL: @test_mfma_i32_32x32x4i8
// CHECK-GFX908: call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 %a, i32 %b, <32 x i32> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_i32_32x32x4i8 on int a/b and v32i c with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_i32_32x32x4i8(global v32i* out, int a, int b, v32i c)
{
*out = __builtin_amdgcn_mfma_i32_32x32x4i8(a, b, c, 0, 0, 0);
}

// CHECK-LABEL: @test_mfma_i32_16x16x4i8
// CHECK: call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
// CHECK-GFX908-LABEL: @test_mfma_i32_16x16x4i8
// CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_i32_16x16x4i8 on int a/b and v16i c with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_i32_16x16x4i8(global v16i* out, int a, int b, v16i c)
{
*out = __builtin_amdgcn_mfma_i32_16x16x4i8(a, b, c, 0, 0, 0);
}

// CHECK-LABEL: @test_mfma_i32_4x4x4i8
// CHECK: call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
// CHECK-GFX908-LABEL: @test_mfma_i32_4x4x4i8
// CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_i32_4x4x4i8 on int a/b and v4i c with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_i32_4x4x4i8(global v4i* out, int a, int b, v4i c)
{
*out = __builtin_amdgcn_mfma_i32_4x4x4i8(a, b, c, 0, 0, 0);
}

// CHECK-LABEL: @test_mfma_i32_32x32x8i8
// CHECK: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
// CHECK-GFX908-LABEL: @test_mfma_i32_32x32x8i8
// CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_i32_32x32x8i8 on int a/b and v16i c with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_i32_32x32x8i8(global v16i* out, int a, int b, v16i c)
{
*out = __builtin_amdgcn_mfma_i32_32x32x8i8(a, b, c, 0, 0, 0);
}

// CHECK-LABEL: @test_mfma_i32_16x16x16i8
// CHECK: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
// CHECK-GFX908-LABEL: @test_mfma_i32_16x16x16i8
// CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_i32_16x16x16i8 on int a/b and v4i c with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_i32_16x16x16i8(global v4i* out, int a, int b, v4i c)
{
*out = __builtin_amdgcn_mfma_i32_16x16x16i8(a, b, c, 0, 0, 0);
}

// CHECK-LABEL: @test_mfma_f32_32x32x2bf16
// CHECK: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2bf16
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f32_32x32x2bf16 on v2s a/b and v32f c with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f32_32x32x2bf16(global v32f* out, v2s a, v2s b, v32f c)
{
*out = __builtin_amdgcn_mfma_f32_32x32x2bf16(a, b, c, 0, 0, 0);
}

// CHECK-LABEL: @test_mfma_f32_16x16x2bf16
// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x2bf16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f32_16x16x2bf16 on v2s a/b and v16f c with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f32_16x16x2bf16(global v16f* out, v2s a, v2s b, v16f c)
{
*out = __builtin_amdgcn_mfma_f32_16x16x2bf16(a, b, c, 0, 0, 0);
}

// CHECK-LABEL: @test_mfma_f32_4x4x2bf16
// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x2bf16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f32_4x4x2bf16 on v2s a/b and v4f c with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f32_4x4x2bf16(global v4f* out, v2s a, v2s b, v4f c)
{
*out = __builtin_amdgcn_mfma_f32_4x4x2bf16(a, b, c, 0, 0, 0);
}

// CHECK-LABEL: @test_mfma_f32_32x32x4bf16
// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4bf16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f32_32x32x4bf16 on v2s a/b and v16f c with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f32_32x32x4bf16(global v16f* out, v2s a, v2s b, v16f c)
{
*out = __builtin_amdgcn_mfma_f32_32x32x4bf16(a, b, c, 0, 0, 0);
}

// CHECK-LABEL: @test_mfma_f32_16x16x8bf16
// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x8bf16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f32_16x16x8bf16 on v2s a/b and v4f c with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f32_16x16x8bf16(global v4f* out, v2s a, v2s b, v4f c)
{
*out = __builtin_amdgcn_mfma_f32_16x16x8bf16(a, b, c, 0, 0, 0);
}

#endif // MFMA_GFX908_TESTS

#ifdef MFMA_GFX90A_TESTS

// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x4bf16_1k
// CHECK-GFX90A: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f32_32x32x4bf16_1k on v4s a/b and v32f c with
// the three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f32_32x32x4bf16_1k(global v32f* out, v4s a, v4s b, v32f c)
{
*out = __builtin_amdgcn_mfma_f32_32x32x4bf16_1k(a, b, c, 0, 0, 0);
}

// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x4bf16_1k
// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f32_16x16x4bf16_1k on v4s a/b and v16f c with
// the three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f32_16x16x4bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
{
*out = __builtin_amdgcn_mfma_f32_16x16x4bf16_1k(a, b, c, 0, 0, 0);
}

// CHECK-GFX90A-LABEL: @test_mfma_f32_4x4x4bf16_1k
// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f32_4x4x4bf16_1k on v4s a/b and v4f c with
// the three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f32_4x4x4bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
{
*out = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a, b, c, 0, 0, 0);
}

// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x8bf16_1k
// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f32_32x32x8bf16_1k on v4s a/b and v16f c with
// the three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f32_32x32x8bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
{
*out = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, b, c, 0, 0, 0);
}

// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x16bf16_1k
// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f32_16x16x16bf16_1k on v4s a/b and v4f c with
// the three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f32_16x16x16bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
{
*out = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a, b, c, 0, 0, 0);
}

// CHECK-GFX90A-LABEL: @test_mfma_f64_16x16x4f64
// CHECK-GFX90A: call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f64_16x16x4f64 on double a/b and v4d c with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f64_16x16x4f64(global v4d* out, double a, double b, v4d c)
{
*out = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, c, 0, 0, 0);
}

// CHECK-GFX90A-LABEL: @test_mfma_f64_4x4x4f64
// CHECK-GFX90A: call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %c, i32 0, i32 0, i32 0)
// Calls __builtin_amdgcn_mfma_f64_4x4x4f64 on scalar double a, b and c (this
// variant takes and returns a plain double rather than a vector) with the
// three trailing integer arguments set to 0, and stores the result to *out.
void test_mfma_f64_4x4x4f64(global double* out, double a, double b, double c)
{
*out = __builtin_amdgcn_mfma_f64_4x4x4f64(a, b, c, 0, 0, 0);
}

#endif // MFMA_GFX90A_TESTS
@@ -22,6 +22,11 @@
// RUN: %clang -### -target amdgcn-amdhsa -mcpu=gfx908:sramecc- %s 2>&1 | FileCheck --check-prefix=NO-SRAM-ECC %s
// NO-SRAM-ECC: "-target-feature" "-sramecc"

// RUN: %clang -### -target amdgcn -mcpu=gfx90a -mtgsplit %s 2>&1 | FileCheck --check-prefix=TGSPLIT %s
// RUN: %clang -### -target amdgcn -mcpu=gfx90a -mno-tgsplit %s 2>&1 | FileCheck --check-prefix=NO-TGSPLIT %s
// TGSPLIT: "-target-feature" "+tgsplit"
// NO-TGSPLIT: "-target-feature" "-tgsplit"

// RUN: %clang -### -target amdgcn-amdpal -mcpu=gfx1010 -mwavefrontsize64 %s 2>&1 | FileCheck --check-prefix=WAVE64 %s
// RUN: %clang -### -target amdgcn-amdpal -mcpu=gfx1010 -mno-wavefrontsize64 -mwavefrontsize64 %s 2>&1 | FileCheck --check-prefix=WAVE64 %s
// WAVE64: "-target-feature" "+wavefrontsize64"
@@ -105,6 +105,7 @@
// RUN: %clang -E -dM -target amdgcn -mcpu=gfx906 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx906
// RUN: %clang -E -dM -target amdgcn -mcpu=gfx908 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx908
// RUN: %clang -E -dM -target amdgcn -mcpu=gfx909 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx909
// RUN: %clang -E -dM -target amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx90a
// RUN: %clang -E -dM -target amdgcn -mcpu=gfx90c %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx90c
// RUN: %clang -E -dM -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1010
// RUN: %clang -E -dM -target amdgcn -mcpu=gfx1011 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1011

0 comments on commit a8d9d50

Please sign in to comment.