Skip to content

Commit

Permalink
[X86][BF16] Support INSERT_SUBVECTOR and CONCAT_VECTORS (#76485)
Browse files Browse the repository at this point in the history
  • Loading branch information
phoebewang committed Dec 28, 2023
1 parent 13cdee9 commit e499ae5
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 35 deletions.
4 changes: 4 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2267,6 +2267,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FDIV, VT, Expand);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
}
setOperationAction(ISD::FP_ROUND, MVT::v8bf16, Custom);
addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
Expand All @@ -2282,6 +2284,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32bf16, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32bf16, Legal);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v32bf16, Custom);
}

if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/X86/X86InstrSSE.td
Original file line number Diff line number Diff line change
Expand Up @@ -7909,6 +7909,9 @@ let Predicates = [HasAVX2, NoVLX] in {
defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8, loadv16i8, loadv32i8>;
}

let Predicates = [HasAVXNECONVERT, NoVLX] in
defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8bf16, v16bf16, loadv8bf16, loadv16bf16>;

//===----------------------------------------------------------------------===//
// VEXTRACTI128 - Extract packed integer values
//
Expand All @@ -7931,6 +7934,9 @@ let Predicates = [HasAVX2, NoVLX] in {
defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
}

let Predicates = [HasAVXNECONVERT, NoVLX] in
defm : vextract_lowering<"VEXTRACTI128", v16bf16, v8bf16>;

//===----------------------------------------------------------------------===//
// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
//
Expand Down
30 changes: 5 additions & 25 deletions llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -372,31 +372,11 @@ entry:

;; FIXME: This should generate the same output as above, but let's fix the crash first.
define <16 x bfloat> @test_no_vbroadcast2() nounwind {
; X86-LABEL: test_no_vbroadcast2:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp # encoding: [0x55]
; X86-NEXT: movl %esp, %ebp # encoding: [0x89,0xe5]
; X86-NEXT: andl $-32, %esp # encoding: [0x83,0xe4,0xe0]
; X86-NEXT: subl $64, %esp # encoding: [0x83,0xec,0x40]
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
; X86-NEXT: vmovaps %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x04,0x24]
; X86-NEXT: vpbroadcastw (%esp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x04,0x24]
; X86-NEXT: movl %ebp, %esp # encoding: [0x89,0xec]
; X86-NEXT: popl %ebp # encoding: [0x5d]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_no_vbroadcast2:
; X64: # %bb.0: # %entry
; X64-NEXT: pushq %rbp # encoding: [0x55]
; X64-NEXT: movq %rsp, %rbp # encoding: [0x48,0x89,0xe5]
; X64-NEXT: andq $-32, %rsp # encoding: [0x48,0x83,0xe4,0xe0]
; X64-NEXT: subq $64, %rsp # encoding: [0x48,0x83,0xec,0x40]
; X64-NEXT: vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
; X64-NEXT: vmovaps %xmm0, (%rsp) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x04,0x24]
; X64-NEXT: vpbroadcastw (%rsp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x04,0x24]
; X64-NEXT: movq %rbp, %rsp # encoding: [0x48,0x89,0xec]
; X64-NEXT: popq %rbp # encoding: [0x5d]
; X64-NEXT: retq # encoding: [0xc3]
; CHECK-LABEL: test_no_vbroadcast2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xc0]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
entry:
%0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
%1 = shufflevector <8 x bfloat> %0, <8 x bfloat> undef, <16 x i32> zeroinitializer
Expand Down
64 changes: 54 additions & 10 deletions llvm/test/CodeGen/X86/bfloat.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2212,17 +2212,10 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
;
; AVXNC-LABEL: fptrunc_v16f32:
; AVXNC: # %bb.0:
; AVXNC-NEXT: pushq %rbp
; AVXNC-NEXT: movq %rsp, %rbp
; AVXNC-NEXT: andq $-32, %rsp
; AVXNC-NEXT: subq $64, %rsp
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm1, %xmm1
; AVXNC-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp)
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
; AVXNC-NEXT: vmovaps %xmm0, (%rsp)
; AVXNC-NEXT: vmovaps (%rsp), %ymm0
; AVXNC-NEXT: movq %rbp, %rsp
; AVXNC-NEXT: popq %rbp
; AVXNC-NEXT: vinsertf128 $0, %xmm0, %ymm0, %ymm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm1, %xmm1
; AVXNC-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXNC-NEXT: retq
%b = fptrunc <16 x float> %a to <16 x bfloat>
ret <16 x bfloat> %b
Expand Down Expand Up @@ -2485,3 +2478,54 @@ define <32 x bfloat> @test_v8bf16_v32bf16(ptr %0) {
%3 = shufflevector <8 x bfloat> %2, <8 x bfloat> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <32 x bfloat> %3
}

; Concatenating two v8bf16 values into a v16bf16 via shufflevector.
; With CONCAT_VECTORS marked Custom for bf16 vectors, AVX should emit a
; single vinsertf128 placing %y in the upper 128-bit lane (no stack
; round-trip); under SSE2 the result already spans xmm0/xmm1, so only a
; return is expected.
define <16 x bfloat> @concat_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
; SSE2-LABEL: concat_v8bf16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    retq
;
; AVX-LABEL: concat_v8bf16:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x bfloat> %x, <8 x bfloat> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x bfloat> %a
}

; Extracting elements 8..15 (the second 128-bit lane) of a v32bf16 as a
; v8bf16. AVX should lower this to a single vextractf128 of lane 1; the
; SSE2 path has no bf16 vector support, so it scalarizes: pairs of 16-bit
; elements are extracted with pextrw, packed into 32- then 64-bit GPRs,
; and reassembled into an xmm register via punpcklqdq.
define <8 x bfloat> @extract_v32bf16_v8bf16(<32 x bfloat> %x) {
; SSE2-LABEL: extract_v32bf16_v8bf16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pextrw $0, %xmm1, %eax
; SSE2-NEXT:    pextrw $1, %xmm1, %ecx
; SSE2-NEXT:    shll $16, %ecx
; SSE2-NEXT:    orl %eax, %ecx
; SSE2-NEXT:    pextrw $2, %xmm1, %eax
; SSE2-NEXT:    pextrw $3, %xmm1, %edx
; SSE2-NEXT:    shll $16, %edx
; SSE2-NEXT:    orl %eax, %edx
; SSE2-NEXT:    shlq $32, %rdx
; SSE2-NEXT:    orq %rcx, %rdx
; SSE2-NEXT:    pextrw $4, %xmm1, %eax
; SSE2-NEXT:    pextrw $5, %xmm1, %ecx
; SSE2-NEXT:    shll $16, %ecx
; SSE2-NEXT:    orl %eax, %ecx
; SSE2-NEXT:    pextrw $6, %xmm1, %eax
; SSE2-NEXT:    pextrw $7, %xmm1, %esi
; SSE2-NEXT:    shll $16, %esi
; SSE2-NEXT:    orl %eax, %esi
; SSE2-NEXT:    shlq $32, %rsi
; SSE2-NEXT:    orq %rcx, %rsi
; SSE2-NEXT:    movq %rsi, %xmm1
; SSE2-NEXT:    movq %rdx, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; AVX-LABEL: extract_v32bf16_v8bf16:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %a = shufflevector <32 x bfloat> %x, <32 x bfloat> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x bfloat> %a
}

0 comments on commit e499ae5

Please sign in to comment.