-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[X86][BF16] Support INSERT_SUBVECTOR and CONCAT_VECTORS #76485
Conversation
@llvm/pr-subscribers-backend-x86 Author: Phoebe Wang (phoebewang) Changes: Full diff: https://github.com/llvm/llvm-project/pull/76485.diff 4 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 63bdf24d6b4f5e..35e54ebd5129f7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2267,6 +2267,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FDIV, VT, Expand);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
}
setOperationAction(ISD::FP_ROUND, MVT::v8bf16, Custom);
addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
@@ -2282,6 +2284,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32bf16, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32bf16, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32bf16, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 0e4e6937bf44cd..e6340e773ea5d5 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -7160,10 +7160,6 @@ def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128rm addr:$src)>;
}
-let Predicates = [HasAVXNECONVERT, NoVLX] in
- def : Pat<(v16bf16 (X86SubVBroadcastld128 addr:$src)),
- (VBROADCASTF128rm addr:$src)>;
-
//===----------------------------------------------------------------------===//
// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
//
@@ -7931,6 +7927,14 @@ let Predicates = [HasAVX2, NoVLX] in {
defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
}
+let Predicates = [HasAVXNECONVERT, NoVLX] in {
+ def : Pat<(v16bf16 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF128rm addr:$src)>;
+ defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8bf16,
+ v16bf16, loadv8bf16, loadv16bf16>;
+ defm : vextract_lowering<"VEXTRACTI128", v16bf16, v8bf16>;
+}
+
//===----------------------------------------------------------------------===//
// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
//
diff --git a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
index 40b512d68be816..46fabb5efede68 100644
--- a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
@@ -372,31 +372,11 @@ entry:
;; FIXME: This should generate the same output as above, but let's fix the crash first.
define <16 x bfloat> @test_no_vbroadcast2() nounwind {
-; X86-LABEL: test_no_vbroadcast2:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %ebp # encoding: [0x55]
-; X86-NEXT: movl %esp, %ebp # encoding: [0x89,0xe5]
-; X86-NEXT: andl $-32, %esp # encoding: [0x83,0xe4,0xe0]
-; X86-NEXT: subl $64, %esp # encoding: [0x83,0xec,0x40]
-; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
-; X86-NEXT: vmovaps %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x04,0x24]
-; X86-NEXT: vpbroadcastw (%esp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x04,0x24]
-; X86-NEXT: movl %ebp, %esp # encoding: [0x89,0xec]
-; X86-NEXT: popl %ebp # encoding: [0x5d]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_no_vbroadcast2:
-; X64: # %bb.0: # %entry
-; X64-NEXT: pushq %rbp # encoding: [0x55]
-; X64-NEXT: movq %rsp, %rbp # encoding: [0x48,0x89,0xe5]
-; X64-NEXT: andq $-32, %rsp # encoding: [0x48,0x83,0xe4,0xe0]
-; X64-NEXT: subq $64, %rsp # encoding: [0x48,0x83,0xec,0x40]
-; X64-NEXT: vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
-; X64-NEXT: vmovaps %xmm0, (%rsp) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x04,0x24]
-; X64-NEXT: vpbroadcastw (%rsp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x04,0x24]
-; X64-NEXT: movq %rbp, %rsp # encoding: [0x48,0x89,0xec]
-; X64-NEXT: popq %rbp # encoding: [0x5d]
-; X64-NEXT: retq # encoding: [0xc3]
+; CHECK-LABEL: test_no_vbroadcast2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
+; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xc0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
entry:
%0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
%1 = shufflevector <8 x bfloat> %0, <8 x bfloat> undef, <16 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 72f3eacf87594c..0b823898c5c9ed 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -2212,17 +2212,10 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
;
; AVXNC-LABEL: fptrunc_v16f32:
; AVXNC: # %bb.0:
-; AVXNC-NEXT: pushq %rbp
-; AVXNC-NEXT: movq %rsp, %rbp
-; AVXNC-NEXT: andq $-32, %rsp
-; AVXNC-NEXT: subq $64, %rsp
-; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm1, %xmm1
-; AVXNC-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp)
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
-; AVXNC-NEXT: vmovaps %xmm0, (%rsp)
-; AVXNC-NEXT: vmovaps (%rsp), %ymm0
-; AVXNC-NEXT: movq %rbp, %rsp
-; AVXNC-NEXT: popq %rbp
+; AVXNC-NEXT: vinsertf128 $0, %xmm0, %ymm0, %ymm0
+; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm1, %xmm1
+; AVXNC-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXNC-NEXT: retq
%b = fptrunc <16 x float> %a to <16 x bfloat>
ret <16 x bfloat> %b
@@ -2485,3 +2478,17 @@ define <32 x bfloat> @test_v8bf16_v32bf16(ptr %0) {
%3 = shufflevector <8 x bfloat> %2, <8 x bfloat> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <32 x bfloat> %3
}
+
+define <16 x bfloat> @concat_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
+; SSE2-LABEL: concat_v8bf16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: concat_v8bf16:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %a = shufflevector <8 x bfloat> %x, <8 x bfloat> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x bfloat> %a
+}
|
llvm/lib/Target/X86/X86InstrSSE.td
Outdated
@@ -7931,6 +7927,14 @@ let Predicates = [HasAVX2, NoVLX] in { | |||
defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>; | |||
} | |||
|
|||
let Predicates = [HasAVXNECONVERT, NoVLX] in { | |||
def : Pat<(v16bf16 (X86SubVBroadcastld128 addr:$src)), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This file seems to be well organized by category: broadcast, insert, and extract. Why put these three together?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Otherwise, we would need to check the Predicates each time. I'm fine either way, and I can change it if you prefer.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I prefer the old way.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
(VBROADCASTF128rm addr:$src)>; | ||
defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8bf16, | ||
v16bf16, loadv8bf16, loadv16bf16>; | ||
defm : vextract_lowering<"VEXTRACTI128", v16bf16, v8bf16>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need to add tests for extract?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Thanks @FreddyLeaf! |
@@ -372,31 +372,11 @@ entry: | |||
|
|||
;; FIXME: This should generate the same output as above, but let's fix the crash first. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove FIXME?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good catch! Removed by 6c87f46
No description provided.