From f11526b091c489ed0b96538bc91a2e4dcfd9ed4f Mon Sep 17 00:00:00 2001
From: Phoebe Wang
Date: Sat, 22 Jul 2023 18:13:58 +0800
Subject: [PATCH] [X86][BF16] Do not scalarize masked load for BF16 when we
 have AVX512BF16

Fixes #63017

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D155952
---
 .../lib/Target/X86/X86TargetTransformInfo.cpp |  11 +-
 llvm/test/CodeGen/X86/bfloat.ll               | 578 ++++++++++++++++++
 2 files changed, 586 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 8bfbd27a5b900a..7dccb9161d5a77 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5839,6 +5839,9 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
   if (ScalarTy->isHalfTy() && ST->hasBWI())
     return true;
 
+  if (ScalarTy->isBFloatTy() && ST->hasBF16())
+    return true;
+
   if (!ScalarTy->isIntegerTy())
     return false;
 
@@ -6294,16 +6297,18 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
     bool UseMaskForCond, bool UseMaskForGaps) {
   auto *VecTy = cast<FixedVectorType>(BaseTy);
 
-  auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) {
+  auto isSupportedOnAVX512 = [&](Type *VecTy) {
     Type *EltTy = cast<FixedVectorType>(VecTy)->getElementType();
     if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
         EltTy->isIntegerTy(32) || EltTy->isPointerTy())
       return true;
     if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
-      return HasBW;
+      return ST->hasBWI();
+    if (EltTy->isBFloatTy())
+      return ST->hasBF16();
     return false;
   };
-  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
+  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
     return getInterleavedMemoryOpCostAVX512(
         Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, CostKind,
         UseMaskForCond, UseMaskForGaps);
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 4caeaf381c874e..43213761bb5cd8 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -581,3 +581,581 @@ define <32 x bfloat> @pr63017() {
 ; BF16-NEXT: retq
   ret <32 x bfloat> zeroinitializer
 }
+
+define <32 x bfloat> @pr63017_2() nounwind {
+; SSE2-LABEL: pr63017_2:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %r14
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: subq $200, %rsp
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_1
+; SSE2-NEXT: # %bb.2: # %cond.load
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: jmp .LBB12_3
+; SSE2-NEXT: .LBB12_1:
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: .LBB12_3: # %else
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_5
+; SSE2-NEXT: # %bb.4: # %cond.load1
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: .LBB12_5: # %else2
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_6
+; SSE2-NEXT: # %bb.7: # %cond.load4
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movdqa %xmm1, %xmm14 +; SSE2-NEXT: movdqa %xmm1, %xmm15 +; SSE2-NEXT: movdqa %xmm1, %xmm12 +; SSE2-NEXT: movdqa %xmm1, %xmm13 +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: movdqa %xmm1, %xmm11 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: jmp .LBB12_8 +; SSE2-NEXT: .LBB12_6: +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movdqa %xmm1, %xmm14 +; SSE2-NEXT: movdqa %xmm1, %xmm15 +; SSE2-NEXT: movdqa %xmm1, %xmm12 +; SSE2-NEXT: movdqa %xmm1, %xmm13 +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: movdqa %xmm1, %xmm11 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: .LBB12_8: # %else5 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_10 +; SSE2-NEXT: # %bb.9: # %cond.load7 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_10: # %else8 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_12 +; SSE2-NEXT: # %bb.11: # %cond.load10 +; SSE2-NEXT: movzwl (%rax), %eax +; 
SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_12: # %else11 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_14 +; SSE2-NEXT: # %bb.13: # %cond.load13 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_14: # %else14 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_16 +; SSE2-NEXT: # %bb.15: # %cond.load16 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_16: # %else17 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_18 +; SSE2-NEXT: # %bb.17: # %cond.load19 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_18: # %else20 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_20 +; SSE2-NEXT: # %bb.19: # %cond.load22 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_20: # %else23 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_22 +; SSE2-NEXT: # %bb.21: # %cond.load25 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_22: # %else26 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_24 +; SSE2-NEXT: # %bb.23: # %cond.load28 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_24: # %else29 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_26 +; SSE2-NEXT: # %bb.25: # %cond.load31 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_26: # %else32 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_28 +; SSE2-NEXT: # %bb.27: # %cond.load34 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_28: # %else35 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_30 +; SSE2-NEXT: # %bb.29: # %cond.load37 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_30: # %else38 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_32 +; SSE2-NEXT: # %bb.31: # %cond.load40 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_32: # %else41 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_34 +; SSE2-NEXT: # %bb.33: # %cond.load43 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_34: # %else44 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_36 +; SSE2-NEXT: # %bb.35: # %cond.load46 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte 
Spill +; SSE2-NEXT: .LBB12_36: # %else47 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_38 +; SSE2-NEXT: # %bb.37: # %cond.load49 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB12_38: # %else50 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_40 +; SSE2-NEXT: # %bb.39: # %cond.load52 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm14 +; SSE2-NEXT: .LBB12_40: # %else53 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_42 +; SSE2-NEXT: # %bb.41: # %cond.load55 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm15 +; SSE2-NEXT: .LBB12_42: # %else56 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_44 +; SSE2-NEXT: # %bb.43: # %cond.load58 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm12 +; SSE2-NEXT: .LBB12_44: # %else59 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_46 +; SSE2-NEXT: # %bb.45: # %cond.load61 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm13 +; SSE2-NEXT: .LBB12_46: # %else62 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_48 +; SSE2-NEXT: # %bb.47: # %cond.load64 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm10 +; SSE2-NEXT: .LBB12_48: # %else65 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_50 +; SSE2-NEXT: # %bb.49: # %cond.load67 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm11 +; SSE2-NEXT: .LBB12_50: # %else68 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_52 +; SSE2-NEXT: # %bb.51: # %cond.load70 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm8 +; SSE2-NEXT: .LBB12_52: # %else71 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_54 +; SSE2-NEXT: # %bb.53: # %cond.load73 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: .LBB12_54: # %else74 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_56 +; SSE2-NEXT: # %bb.55: # %cond.load76 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm6 +; SSE2-NEXT: .LBB12_56: # %else77 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_58 +; SSE2-NEXT: # %bb.57: # %cond.load79 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm7 +; SSE2-NEXT: .LBB12_58: # %else80 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_60 +; SSE2-NEXT: # %bb.59: # %cond.load82 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: .LBB12_60: # %else83 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_62 +; SSE2-NEXT: # %bb.61: # %cond.load85 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: .LBB12_62: # %else86 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: jne .LBB12_64 +; SSE2-NEXT: # %bb.63: # %cond.load88 +; SSE2-NEXT: movzwl (%rax), 
%eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: .LBB12_64: # %else89 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jne .LBB12_65 +; SSE2-NEXT: # %bb.66: # %cond.load91 +; SSE2-NEXT: movzwl (%rax), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: jmp .LBB12_67 +; SSE2-NEXT: .LBB12_65: +; SSE2-NEXT: movd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: .LBB12_67: # %else92 +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebx, %r14d +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: orl %ebx, %eax +; SSE2-NEXT: shlq $32, %rax +; SSE2-NEXT: orq %r14, %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebx, %r14d +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: orl %ebx, %eax +; SSE2-NEXT: shlq $32, %rax +; SSE2-NEXT: orq %r14, %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebx, %r14d +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: orl %ebx, %eax +; SSE2-NEXT: shlq $32, %rax +; SSE2-NEXT: orq %r14, %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebx, %r14d +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: orl %ebx, %eax +; SSE2-NEXT: shlq $32, %rax +; SSE2-NEXT: orq %r14, %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebx, %r14d +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: orl %ebx, %eax +; SSE2-NEXT: shlq $32, %rax +; SSE2-NEXT: orq %r14, %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebx, %r14d +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: orl %ebx, %eax +; SSE2-NEXT: shlq $32, %rax +; SSE2-NEXT: orq %r14, %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebx, %r14d +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: orl %ebx, %eax +; SSE2-NEXT: shlq $32, %rax +; SSE2-NEXT: orq %r14, %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebx, %r14d +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: orl %ebx, %eax +; SSE2-NEXT: shlq $32, %rax +; SSE2-NEXT: orq %r14, %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE2-NEXT: addq $200, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %r14
+; SSE2-NEXT: retq
+;
+; BF16-LABEL: pr63017_2:
+; BF16: # %bb.0:
+; BF16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
+; BF16-NEXT: vmovdqu16 (%rax), %zmm0 {%k1}
+; BF16-NEXT: retq
+  %1 = call <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr poison, i32 2, <32 x i1> poison, <32 x bfloat> <bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80>)
+  ret <32 x bfloat> %1
+}
+
+declare <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr, i32, <32 x i1>, <32 x bfloat>)
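
A quick standalone way to exercise the changed path (a sketch, assuming a
local llc build; the file name, function name, and value names below are
illustrative, not taken from the patch or the test suite):

  ; bf16-masked-load.ll
  declare <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr, i32, <32 x i1>, <32 x bfloat>)

  define <32 x bfloat> @load_bf16(ptr %p, <32 x i1> %m, <32 x bfloat> %pt) {
    ; With -mattr=+avx512bf16, X86TTIImpl::isLegalMaskedLoad now returns true
    ; for bfloat, so ScalarizeMaskedMemIntrin leaves the intrinsic intact and
    ; the load selects to a single masked vmovdqu16 (plus mask setup) rather
    ; than the 32 scalarized conditional loads seen in the SSE2 output above.
    %v = call <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr %p, i32 2, <32 x i1> %m, <32 x bfloat> %pt)
    ret <32 x bfloat> %v
  }

Compile with: llc -mtriple=x86_64-linux-gnu -mattr=+avx512bf16 bf16-masked-load.ll -o -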