diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 63bdf24d6b4f5..35e54ebd5129f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2267,6 +2267,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FDIV, VT, Expand);
       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
       setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
+      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
     }
     setOperationAction(ISD::FP_ROUND, MVT::v8bf16, Custom);
     addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
@@ -2282,6 +2284,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
     setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32bf16, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32bf16, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v32bf16, Custom);
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 0e4e6937bf44c..b61a694630d19 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -7909,6 +7909,9 @@ let Predicates = [HasAVX2, NoVLX] in {
   defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8, loadv16i8, loadv32i8>;
 }
 
+let Predicates = [HasAVXNECONVERT, NoVLX] in
+  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8bf16, v16bf16, loadv8bf16, loadv16bf16>;
+
 //===----------------------------------------------------------------------===//
 // VEXTRACTI128 - Extract packed integer values
 //
@@ -7931,6 +7934,9 @@ let Predicates = [HasAVX2, NoVLX] in {
   defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
 }
 
+let Predicates = [HasAVXNECONVERT, NoVLX] in
+  defm : vextract_lowering<"VEXTRACTI128", v16bf16, v8bf16>;
+
 //===----------------------------------------------------------------------===//
 // VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
 //
diff --git a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
index 40b512d68be81..46fabb5efede6 100644
--- a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
@@ -372,31 +372,11 @@ entry:
 
 ;; FIXME: This should generate the same output as above, but let's fix the crash first.
 define <16 x bfloat> @test_no_vbroadcast2() nounwind {
-; X86-LABEL: test_no_vbroadcast2:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp # encoding: [0x55]
-; X86-NEXT:    movl %esp, %ebp # encoding: [0x89,0xe5]
-; X86-NEXT:    andl $-32, %esp # encoding: [0x83,0xe4,0xe0]
-; X86-NEXT:    subl $64, %esp # encoding: [0x83,0xec,0x40]
-; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
-; X86-NEXT:    vmovaps %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x04,0x24]
-; X86-NEXT:    vpbroadcastw (%esp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x04,0x24]
-; X86-NEXT:    movl %ebp, %esp # encoding: [0x89,0xec]
-; X86-NEXT:    popl %ebp # encoding: [0x5d]
-; X86-NEXT:    retl # encoding: [0xc3]
-;
-; X64-LABEL: test_no_vbroadcast2:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    pushq %rbp # encoding: [0x55]
-; X64-NEXT:    movq %rsp, %rbp # encoding: [0x48,0x89,0xe5]
-; X64-NEXT:    andq $-32, %rsp # encoding: [0x48,0x83,0xe4,0xe0]
-; X64-NEXT:    subq $64, %rsp # encoding: [0x48,0x83,0xec,0x40]
-; X64-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
-; X64-NEXT:    vmovaps %xmm0, (%rsp) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x04,0x24]
-; X64-NEXT:    vpbroadcastw (%rsp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x04,0x24]
-; X64-NEXT:    movq %rbp, %rsp # encoding: [0x48,0x89,0xec]
-; X64-NEXT:    popq %rbp # encoding: [0x5d]
-; X64-NEXT:    retq # encoding: [0xc3]
+; CHECK-LABEL: test_no_vbroadcast2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
+; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xc0]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 entry:
   %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   %1 = shufflevector <8 x bfloat> %0, <8 x bfloat> undef, <16 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 72f3eacf87594..674a0eacb0ca9 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -2212,17 +2212,10 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
 ;
 ; AVXNC-LABEL: fptrunc_v16f32:
 ; AVXNC:       # %bb.0:
-; AVXNC-NEXT:    pushq %rbp
-; AVXNC-NEXT:    movq %rsp, %rbp
-; AVXNC-NEXT:    andq $-32, %rsp
-; AVXNC-NEXT:    subq $64, %rsp
-; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm1, %xmm1
-; AVXNC-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
 ; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm0, %xmm0
-; AVXNC-NEXT:    vmovaps %xmm0, (%rsp)
-; AVXNC-NEXT:    vmovaps (%rsp), %ymm0
-; AVXNC-NEXT:    movq %rbp, %rsp
-; AVXNC-NEXT:    popq %rbp
+; AVXNC-NEXT:    vinsertf128 $0, %xmm0, %ymm0, %ymm0
+; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm1, %xmm1
+; AVXNC-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVXNC-NEXT:    retq
   %b = fptrunc <16 x float> %a to <16 x bfloat>
   ret <16 x bfloat> %b
@@ -2485,3 +2478,54 @@ define <32 x bfloat> @test_v8bf16_v32bf16(ptr %0) {
   %3 = shufflevector <8 x bfloat> %2, <8 x bfloat> %2, <32 x i32> 
   ret <32 x bfloat> %3
 }
+
+define <16 x bfloat> @concat_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
+; SSE2-LABEL: concat_v8bf16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: concat_v8bf16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+  %a = shufflevector <8 x bfloat> %x, <8 x bfloat> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x bfloat> %a
+}
+
+define <8 x bfloat> @extract_v32bf16_v8bf16(<32 x bfloat> %x) {
+; SSE2-LABEL: extract_v32bf16_v8bf16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pextrw $0, %xmm1, %eax
+; SSE2-NEXT:    pextrw $1, %xmm1, %ecx
+; SSE2-NEXT:    shll $16, %ecx
+; SSE2-NEXT:    orl %eax, %ecx
+; SSE2-NEXT:    pextrw $2, %xmm1, %eax
+; SSE2-NEXT:    pextrw $3, %xmm1, %edx
+; SSE2-NEXT:    shll $16, %edx
+; SSE2-NEXT:    orl %eax, %edx
+; SSE2-NEXT:    shlq $32, %rdx
+; SSE2-NEXT:    orq %rcx, %rdx
+; SSE2-NEXT:    pextrw $4, %xmm1, %eax
+; SSE2-NEXT:    pextrw $5, %xmm1, %ecx
+; SSE2-NEXT:    shll $16, %ecx
+; SSE2-NEXT:    orl %eax, %ecx
+; SSE2-NEXT:    pextrw $6, %xmm1, %eax
+; SSE2-NEXT:    pextrw $7, %xmm1, %esi
+; SSE2-NEXT:    shll $16, %esi
+; SSE2-NEXT:    orl %eax, %esi
+; SSE2-NEXT:    shlq $32, %rsi
+; SSE2-NEXT:    orq %rcx, %rsi
+; SSE2-NEXT:    movq %rsi, %xmm1
+; SSE2-NEXT:    movq %rdx, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: extract_v32bf16_v8bf16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+  %a = shufflevector <32 x bfloat> %x, <32 x bfloat> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x bfloat> %a
+}
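
Note for reviewers: a quick way to reproduce the codegen change outside the in-tree tests is a standalone .ll file; the file name and RUN invocation below are illustrative assumptions, not part of the patch. With INSERT_SUBVECTOR marked Legal and CONCAT_VECTORS custom-lowered for bf16 vectors, the concat should select a single vinsertf128 instead of spilling both halves through a 32-byte stack temporary, matching the AVX output of concat_v8bf16 above.

; concat-repro.ll (hypothetical reproducer, not part of the patch)
; RUN: llc -mtriple=x86_64-- -mattr=+avxneconvert < %s | FileCheck %s
; CHECK-LABEL: repro:
; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
define <16 x bfloat> @repro(<8 x bfloat> %x, <8 x bfloat> %y) {
  ; Lanes 0-7 come from %x and lanes 8-15 from %y, so this shuffle is a
  ; pure two-operand concat and maps onto a 128-bit subvector insert.
  %a = shufflevector <8 x bfloat> %x, <8 x bfloat> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x bfloat> %a
}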