[X86][BF16] Support INSERT_SUBVECTOR and CONCAT_VECTORS #76485

phoebewang · 2023-12-28T03:50:13Z

No description provided.

llvmbot · 2023-12-28T03:50:40Z

@llvm/pr-subscribers-backend-x86

Author: Phoebe Wang (phoebewang)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/76485.diff

4 Files Affected:

(modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+4)
(modified) llvm/lib/Target/X86/X86InstrSSE.td (+8-4)
(modified) llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll (+5-25)
(modified) llvm/test/CodeGen/X86/bfloat.ll (+17-10)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 63bdf24d6b4f5e..35e54ebd5129f7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2267,6 +2267,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FDIV, VT, Expand);
       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
       setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
+      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
     }
     setOperationAction(ISD::FP_ROUND, MVT::v8bf16, Custom);
     addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
@@ -2282,6 +2284,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
     setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32bf16, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32bf16, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v32bf16, Custom);
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 0e4e6937bf44cd..e6340e773ea5d5 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -7160,10 +7160,6 @@ def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
           (VBROADCASTF128rm addr:$src)>;
 }
 
-let Predicates = [HasAVXNECONVERT, NoVLX] in
-  def : Pat<(v16bf16 (X86SubVBroadcastld128 addr:$src)),
-            (VBROADCASTF128rm addr:$src)>;
-
 //===----------------------------------------------------------------------===//
 // VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
 //
@@ -7931,6 +7927,14 @@ let Predicates = [HasAVX2, NoVLX] in {
   defm : vextract_lowering<"VEXTRACTI128", v32i8,  v16i8>;
 }
 
+let Predicates = [HasAVXNECONVERT, NoVLX] in {
+  def : Pat<(v16bf16 (X86SubVBroadcastld128 addr:$src)),
+            (VBROADCASTF128rm addr:$src)>;
+  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8bf16,
+                          v16bf16, loadv8bf16,  loadv16bf16>;
+  defm : vextract_lowering<"VEXTRACTI128", v16bf16, v8bf16>;
+}
+
 //===----------------------------------------------------------------------===//
 // VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
 //
diff --git a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
index 40b512d68be816..46fabb5efede68 100644
--- a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
@@ -372,31 +372,11 @@ entry:
 
 ;; FIXME: This should generate the same output as above, but let's fix the crash first.
 define <16 x bfloat> @test_no_vbroadcast2() nounwind {
-; X86-LABEL: test_no_vbroadcast2:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp # encoding: [0x55]
-; X86-NEXT:    movl %esp, %ebp # encoding: [0x89,0xe5]
-; X86-NEXT:    andl $-32, %esp # encoding: [0x83,0xe4,0xe0]
-; X86-NEXT:    subl $64, %esp # encoding: [0x83,0xec,0x40]
-; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
-; X86-NEXT:    vmovaps %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x04,0x24]
-; X86-NEXT:    vpbroadcastw (%esp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x04,0x24]
-; X86-NEXT:    movl %ebp, %esp # encoding: [0x89,0xec]
-; X86-NEXT:    popl %ebp # encoding: [0x5d]
-; X86-NEXT:    retl # encoding: [0xc3]
-;
-; X64-LABEL: test_no_vbroadcast2:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    pushq %rbp # encoding: [0x55]
-; X64-NEXT:    movq %rsp, %rbp # encoding: [0x48,0x89,0xe5]
-; X64-NEXT:    andq $-32, %rsp # encoding: [0x48,0x83,0xe4,0xe0]
-; X64-NEXT:    subq $64, %rsp # encoding: [0x48,0x83,0xec,0x40]
-; X64-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
-; X64-NEXT:    vmovaps %xmm0, (%rsp) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x04,0x24]
-; X64-NEXT:    vpbroadcastw (%rsp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x04,0x24]
-; X64-NEXT:    movq %rbp, %rsp # encoding: [0x48,0x89,0xec]
-; X64-NEXT:    popq %rbp # encoding: [0x5d]
-; X64-NEXT:    retq # encoding: [0xc3]
+; CHECK-LABEL: test_no_vbroadcast2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
+; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xc0]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 entry:
   %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   %1 = shufflevector <8 x bfloat> %0, <8 x bfloat> undef, <16 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 72f3eacf87594c..0b823898c5c9ed 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -2212,17 +2212,10 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
 ;
 ; AVXNC-LABEL: fptrunc_v16f32:
 ; AVXNC:       # %bb.0:
-; AVXNC-NEXT:    pushq %rbp
-; AVXNC-NEXT:    movq %rsp, %rbp
-; AVXNC-NEXT:    andq $-32, %rsp
-; AVXNC-NEXT:    subq $64, %rsp
-; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm1, %xmm1
-; AVXNC-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
 ; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm0, %xmm0
-; AVXNC-NEXT:    vmovaps %xmm0, (%rsp)
-; AVXNC-NEXT:    vmovaps (%rsp), %ymm0
-; AVXNC-NEXT:    movq %rbp, %rsp
-; AVXNC-NEXT:    popq %rbp
+; AVXNC-NEXT:    vinsertf128 $0, %xmm0, %ymm0, %ymm0
+; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm1, %xmm1
+; AVXNC-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVXNC-NEXT:    retq
   %b = fptrunc <16 x float> %a to <16 x bfloat>
   ret <16 x bfloat> %b
@@ -2485,3 +2478,17 @@ define <32 x bfloat> @test_v8bf16_v32bf16(ptr %0) {
   %3 = shufflevector <8 x bfloat> %2, <8 x bfloat> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <32 x bfloat> %3
 }
+
+define <16 x bfloat> @concat_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
+; SSE2-LABEL: concat_v8bf16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: concat_v8bf16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+  %a = shufflevector <8 x bfloat> %x, <8 x bfloat> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x bfloat> %a
+}

FreddyLeaf · 2023-12-28T04:06:56Z

llvm/lib/Target/X86/X86InstrSSE.td

@@ -7931,6 +7927,14 @@ let Predicates = [HasAVX2, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTI128", v32i8,  v16i8>;
 }

+let Predicates = [HasAVXNECONVERT, NoVLX] in {
+  def : Pat<(v16bf16 (X86SubVBroadcastld128 addr:$src)),


This file seems to be well organized by category: broadcast, insert, and extract. Why put these three together?

With the old organization, we need to repeat the Predicates check each time. I'm fine either way; I can change it if you prefer.

Yes, I prefer the old way.

FreddyLeaf · 2023-12-28T04:07:27Z

llvm/lib/Target/X86/X86InstrSSE.td

+            (VBROADCASTF128rm addr:$src)>;
+  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8bf16,
+                          v16bf16, loadv8bf16,  loadv16bf16>;
+  defm : vextract_lowering<"VEXTRACTI128", v16bf16, v8bf16>;


Do we need to add tests for extract?

FreddyLeaf

LGTM

phoebewang · 2023-12-28T05:28:52Z

Thanks @FreddyLeaf !

RKSimon · 2023-12-28T15:00:23Z

llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll

@@ -372,31 +372,11 @@ entry:

 ;; FIXME: This should generate the same output as above, but let's fix the crash first.


Remove FIXME?

Good catch! Removed by 6c87f46

Solved by #76485.

[X86][BF16] Support INSERT_SUBVECTOR and CONCAT_VECTORS

b1a13a3

phoebewang requested review from RKSimon and FreddyLeaf December 28, 2023 03:50

llvmbot added the backend:X86 label Dec 28, 2023

FreddyLeaf reviewed Dec 28, 2023

View reviewed changes

phoebewang added 2 commits December 28, 2023 13:06

Add extract tests

0227902

Keep the old format

94007ee

FreddyLeaf approved these changes Dec 28, 2023

View reviewed changes

phoebewang merged commit e499ae5 into llvm:main Dec 28, 2023
3 of 4 checks passed

phoebewang deleted the bf16 branch December 28, 2023 05:29

RKSimon reviewed Dec 28, 2023

View reviewed changes

phoebewang added a commit that referenced this pull request Dec 29, 2023

[X86][NFC] Remove meaningless FIXME

6c87f46

Solved by #76485.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[X86][BF16] Support INSERT_SUBVECTOR and CONCAT_VECTORS #76485

[X86][BF16] Support INSERT_SUBVECTOR and CONCAT_VECTORS #76485

phoebewang commented Dec 28, 2023

llvmbot commented Dec 28, 2023

FreddyLeaf Dec 28, 2023

phoebewang Dec 28, 2023

FreddyLeaf Dec 28, 2023

phoebewang Dec 28, 2023

FreddyLeaf Dec 28, 2023

phoebewang Dec 28, 2023

FreddyLeaf left a comment

phoebewang commented Dec 28, 2023

RKSimon Dec 28, 2023

phoebewang Dec 29, 2023

		@@ -372,31 +372,11 @@ entry:

		;; FIXME: This should generate the same output as above, but let's fix the crash first.

[X86][BF16] Support INSERT_SUBVECTOR and CONCAT_VECTORS #76485

[X86][BF16] Support INSERT_SUBVECTOR and CONCAT_VECTORS #76485

Conversation

phoebewang commented Dec 28, 2023

llvmbot commented Dec 28, 2023

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

FreddyLeaf left a comment

Choose a reason for hiding this comment

phoebewang commented Dec 28, 2023

Choose a reason for hiding this comment

Choose a reason for hiding this comment