diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl index 7688dfa55a78e..1ada16610d0b3 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl @@ -15,8 +15,8 @@ typedef unsigned short __attribute__((ext_vector_type(2))) ushort2; // CHECK-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %v2ssB to <2 x bfloat> // CHECK-NEXT: [[s3:%[0-9]+]] = bitcast i16 %sC to bfloat // CHECK-NEXT: [[d:%[0-9]+]] = tail call bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], bfloat [[s3]]) -// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 false) -// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 true) +// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], float %fC, i1 false) +// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], float %fC, i1 true) // CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false) // CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 true) // CHECK: call i32 @llvm.amdgcn.sudot4(i1 true, i32 %A, i1 false, i32 %B, i32 %C, i1 false) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 6795fb7aa0edb..0f29653f1f5be 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2835,8 +2835,8 @@ def int_amdgcn_fdot2_f32_bf16 : DefaultAttrsIntrinsic< [llvm_float_ty], // %r [ - llvm_v2i16_ty, // %a - llvm_v2i16_ty, // %b + llvm_v2bf16_ty, // %a + llvm_v2bf16_ty, // %b llvm_float_ty, // %c llvm_i1_ty // %clamp ], diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 140c99ff30414..cd14c12a8a80c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2494,7 +2494,7 @@ def VOP_V2I16_I32_I32 : VOPProfile <[v2i16, i32, i32, untyped]>; def VOP_F16_V2F16_V2F16_F16 : VOPProfile <[f16, v2f16, v2f16, f16]>; def VOP_BF16_V2BF16_V2BF16_BF16: VOPProfile <[bf16, v2bf16, v2bf16, bf16]>; -def VOP_F32_V2I16_V2I16_F32 : VOPProfile <[f32, v2i16, v2i16, f32]>; +def VOP_F32_V2BF16_V2BF16_F32 : VOPProfile <[f32, v2bf16, v2bf16, f32]>; def VOP_F32_V2F16_V2F16_V2F16 : VOPProfile <[f32, v2f16, v2f16, v2f16]>; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 886858b5ab1ad..74f451b6d4f7f 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -396,7 +396,7 @@ defm V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", } // End OtherPredicates = [HasDot1Insts] def DOT2_BF16_Profile - : VOP3P_Profile<VOP_F32_V2I16_V2I16_F32> { + : VOP3P_Profile<VOP_F32_V2BF16_V2BF16_F32> { let HasSrc1Mods = 1; } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll index 367ff57bae2fd..e51b1d2da2e41 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 -declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %a, <2 x i16> %b, float %c, i1 %clamp) +declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 %clamp) define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp( ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp: @@ -25,10 +25,10 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp( ptr addrspace(1) %b, ptr addrspace(1) %c) { entry: - %a.val = load <2 x i16>, ptr addrspace(1) %a - %b.val = load <2 x i16>, ptr addrspace(1) %b 
+ %a.val = load <2 x bfloat>, ptr addrspace(1) %a + %b.val = load <2 x bfloat>, ptr addrspace(1) %b %c.val = load float, ptr addrspace(1) %c - %r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %a.val, <2 x i16> %b.val, float %c.val, i1 1) + %r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a.val, <2 x bfloat> %b.val, float %c.val, i1 1) store float %r.val, ptr addrspace(1) %r ret void } @@ -55,10 +55,10 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp( ptr addrspace(1) %b, ptr addrspace(1) %c) { entry: - %a.val = load <2 x i16>, ptr addrspace(1) %a - %b.val = load <2 x i16>, ptr addrspace(1) %b + %a.val = load <2 x bfloat>, ptr addrspace(1) %a + %b.val = load <2 x bfloat>, ptr addrspace(1) %b %c.val = load float, ptr addrspace(1) %c - %r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %a.val, <2 x i16> %b.val, float %c.val, i1 0) + %r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a.val, <2 x bfloat> %b.val, float %c.val, i1 0) store float %r.val, ptr addrspace(1) %r ret void } diff --git a/llvm/test/MC/AMDGPU/bf16_imm.s b/llvm/test/MC/AMDGPU/bf16_imm.s index 271bd19aa4969..1bf76dfa17314 100644 --- a/llvm/test/MC/AMDGPU/bf16_imm.s +++ b/llvm/test/MC/AMDGPU/bf16_imm.s @@ -48,3 +48,66 @@ v_dot2_bf16_bf16 v2, v0, 0x3e22, v2 v_dot2_bf16_bf16 v2, v0, v2, 0.15915494 // CHECK: v_dot2_bf16_bf16 v2, v0, v2, 0.15915494 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0x05,0xe2,0x03] + +v_dot2_f32_bf16 v2, v1, 0, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, 0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0x01,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, 0.5, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, 0.5, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe1,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, -0.5, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, -0.5, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe3,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, 1.0, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, 1.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe5,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, 
-1.0, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, -1.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe7,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, 2.0, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, 2.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe9,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, -2.0, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, -2.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xeb,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, 4.0, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, 4.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xed,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, -4.0, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, -4.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xef,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, 0.15915494, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, 0.15915494, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xf1,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, 0x3e22, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, 0.15915494, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xf1,0x09,0x1c] + +v_dot2_f32_bf16 v2, 0.5, v1, v2 +// CHECK: v_dot2_f32_bf16 v2, 0.5, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf0,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, -0.5, v1, v2 +// CHECK: v_dot2_f32_bf16 v2, -0.5, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf1,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, 1.0, v1, v2 +// CHECK: v_dot2_f32_bf16 v2, 1.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf2,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, -1.0, v1, v2 +// CHECK: v_dot2_f32_bf16 v2, -1.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf3,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, 2.0, v1, v2 +// CHECK: v_dot2_f32_bf16 v2, 2.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf4,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, -2.0, v1, v2 +// CHECK: v_dot2_f32_bf16 v2, -2.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf5,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, 4.0, v1, v2 +// CHECK: v_dot2_f32_bf16 v2, 4.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf6,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, -4.0, v1, v2 +// CHECK: v_dot2_f32_bf16 v2, -4.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf7,0x02,0x0a,0x1c] + 
+v_dot2_f32_bf16 v2, 100.0, v1, v2 +// CHECK: v_dot2_f32_bf16 v2, 0x42c8, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xff,0x02,0x0a,0x1c,0xc8,0x42,0x00,0x00] + +v_dot2_f32_bf16 v2, v1, 100.0, v2 +// CHECK: v_dot2_f32_bf16 v2, v1, 0x42c8, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xff,0x09,0x1c,0xc8,0x42,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt b/llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt index 035e013e7772b..75a453f1c30ee 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt @@ -36,3 +36,66 @@ # CHECK: v_dot2_bf16_bf16 v2, v0, 0.15915494, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xf1,0x09,0x04] 0x02,0x00,0x67,0xd6,0x00,0xf1,0x09,0x04 + +# CHECK: v_dot2_f32_bf16 v2, v1, 0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0x01,0x09,0x1c] +0x02,0x40,0x1a,0xcc,0x01,0x01,0x09,0x1c + +# CHECK: v_dot2_f32_bf16 v2, v1, 0.5, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe1,0x09,0x1c] +0x02,0x40,0x1a,0xcc,0x01,0xe1,0x09,0x1c + +# CHECK: v_dot2_f32_bf16 v2, v1, -0.5, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe3,0x09,0x1c] +0x02,0x40,0x1a,0xcc,0x01,0xe3,0x09,0x1c + +# CHECK: v_dot2_f32_bf16 v2, v1, 1.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe5,0x09,0x1c] +0x02,0x40,0x1a,0xcc,0x01,0xe5,0x09,0x1c + +# CHECK: v_dot2_f32_bf16 v2, v1, -1.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe7,0x09,0x1c] +0x02,0x40,0x1a,0xcc,0x01,0xe7,0x09,0x1c + +# CHECK: v_dot2_f32_bf16 v2, v1, 2.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xe9,0x09,0x1c] +0x02,0x40,0x1a,0xcc,0x01,0xe9,0x09,0x1c + +# CHECK: v_dot2_f32_bf16 v2, v1, -2.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xeb,0x09,0x1c] +0x02,0x40,0x1a,0xcc,0x01,0xeb,0x09,0x1c + +# CHECK: v_dot2_f32_bf16 v2, v1, 4.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xed,0x09,0x1c] +0x02,0x40,0x1a,0xcc,0x01,0xed,0x09,0x1c + +# CHECK: v_dot2_f32_bf16 v2, v1, -4.0, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xef,0x09,0x1c] +0x02,0x40,0x1a,0xcc,0x01,0xef,0x09,0x1c + +# CHECK: 
v_dot2_f32_bf16 v2, v1, 0.15915494, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xf1,0x09,0x1c] +0x02,0x40,0x1a,0xcc,0x01,0xf1,0x09,0x1c + +# CHECK: v_dot2_f32_bf16 v2, v1, 0.15915494, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xf1,0x09,0x1c] +0x02,0x40,0x1a,0xcc,0x01,0xf1,0x09,0x1c + +# CHECK: v_dot2_f32_bf16 v2, 0.5, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf0,0x02,0x0a,0x1c] +0x02,0x40,0x1a,0xcc,0xf0,0x02,0x0a,0x1c + +# CHECK: v_dot2_f32_bf16 v2, -0.5, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf1,0x02,0x0a,0x1c] +0x02,0x40,0x1a,0xcc,0xf1,0x02,0x0a,0x1c + +# CHECK: v_dot2_f32_bf16 v2, 1.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf2,0x02,0x0a,0x1c] +0x02,0x40,0x1a,0xcc,0xf2,0x02,0x0a,0x1c + +# CHECK: v_dot2_f32_bf16 v2, -1.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf3,0x02,0x0a,0x1c] +0x02,0x40,0x1a,0xcc,0xf3,0x02,0x0a,0x1c + +# CHECK: v_dot2_f32_bf16 v2, 2.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf4,0x02,0x0a,0x1c] +0x02,0x40,0x1a,0xcc,0xf4,0x02,0x0a,0x1c + +# CHECK: v_dot2_f32_bf16 v2, -2.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf5,0x02,0x0a,0x1c] +0x02,0x40,0x1a,0xcc,0xf5,0x02,0x0a,0x1c + +# CHECK: v_dot2_f32_bf16 v2, 4.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf6,0x02,0x0a,0x1c] +0x02,0x40,0x1a,0xcc,0xf6,0x02,0x0a,0x1c + +# CHECK: v_dot2_f32_bf16 v2, -4.0, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xf7,0x02,0x0a,0x1c] +0x02,0x40,0x1a,0xcc,0xf7,0x02,0x0a,0x1c + +# CHECK: v_dot2_f32_bf16 v2, 0x42c8, v1, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0xff,0x02,0x0a,0x1c,0xc8,0x42,0x00,0x00] +0x02,0x40,0x1a,0xcc,0xff,0x02,0x0a,0x1c,0xc8,0x42,0x00,0x00 + +# CHECK: v_dot2_f32_bf16 v2, v1, 0x42c8, v2 ; encoding: [0x02,0x40,0x1a,0xcc,0x01,0xff,0x09,0x1c,0xc8,0x42,0x00,0x00] +0x02,0x40,0x1a,0xcc,0x01,0xff,0x09,0x1c,0xc8,0x42,0x00,0x00