diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 59d17ec9fd654..49d6eb9732059 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23405,13 +23405,15 @@ static SDValue combineShuffleOfSplatVal(ShuffleVectorSDNode *Shuf,
   if (DAG.isSplatValue(Shuf->getOperand(0), DemandedElts, UndefElts)) {
     // Even if all demanded elements are splat, some of them could be undef.
     // Which lowest demanded element is *not* known-undef?
-    unsigned MinNonUndefIdx = ~0U;
+    std::optional<unsigned> MinNonUndefIdx;
     for (int Idx : Shuf->getMask()) {
       if (Idx < 0 || UndefElts[Idx])
         continue; // Ignore sentinel indices, and undef elements.
-      MinNonUndefIdx = std::min<unsigned>(Idx, MinNonUndefIdx);
+      MinNonUndefIdx = std::min<unsigned>(Idx, MinNonUndefIdx.value_or(~0U));
     }
-    assert(MinNonUndefIdx < NumElts && "Undef shuffle?");
+    if (!MinNonUndefIdx)
+      return DAG.getUNDEF(VT); // All undef - result is undef.
+    assert(*MinNonUndefIdx < NumElts && "Expected valid element index.");
     SmallVector<int> SplatMask(Shuf->getMask().begin(),
                                Shuf->getMask().end());
     for (int &Idx : SplatMask) {
@@ -23419,7 +23421,7 @@ static SDValue combineShuffleOfSplatVal(ShuffleVectorSDNode *Shuf,
         continue; // Passthrough sentinel indices.
       // Otherwise, just pick the lowest demanded non-undef element.
       // Or sentinel undef, if we know we'd pick a known-undef element.
-      Idx = UndefElts[Idx] ? -1 : MinNonUndefIdx;
+      Idx = UndefElts[Idx] ? -1 : *MinNonUndefIdx;
     }
     assert(SplatMask != Shuf->getMask() && "Expected mask to change!");
     return DAG.getVectorShuffle(VT, SDLoc(Shuf), Shuf->getOperand(0),
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index f71667d49080a..3da5fd3acc32b 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -6283,6 +6283,158 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge
   ret void
 }
 
+; From https://reviews.llvm.org/rGf8d9097168b7#1165311
+define void @undefshuffle(<8 x i1> %i0, ptr %src, ptr %dst) #0 {
+; SSE-LABEL: undefshuffle:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    movb $1, %al
+; SSE-NEXT:    testb %al, %al
+; SSE-NEXT:    testb %al, %al
+; SSE-NEXT:    testb %al, %al
+; SSE-NEXT:    testb %al, %al
+; SSE-NEXT:    testb %al, %al
+; SSE-NEXT:    testb %al, %al
+; SSE-NEXT:    testb %al, %al
+; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE-NEXT:    testb %al, %al
+; SSE-NEXT:    psllw $15, %xmm0
+; SSE-NEXT:    packsswb %xmm0, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    testb $1, %al
+; SSE-NEXT:    jne LBB32_1
+; SSE-NEXT:  ## %bb.2: ## %else23
+; SSE-NEXT:    testb $2, %al
+; SSE-NEXT:    jne LBB32_3
+; SSE-NEXT:  LBB32_4: ## %else25
+; SSE-NEXT:    testb $4, %al
+; SSE-NEXT:    jne LBB32_5
+; SSE-NEXT:  LBB32_6: ## %else27
+; SSE-NEXT:    testb $8, %al
+; SSE-NEXT:    jne LBB32_7
+; SSE-NEXT:  LBB32_8: ## %else29
+; SSE-NEXT:    testb $16, %al
+; SSE-NEXT:    jne LBB32_9
+; SSE-NEXT:  LBB32_10: ## %else31
+; SSE-NEXT:    testb $32, %al
+; SSE-NEXT:    jne LBB32_11
+; SSE-NEXT:  LBB32_12: ## %else33
+; SSE-NEXT:    testb $64, %al
+; SSE-NEXT:    jne LBB32_13
+; SSE-NEXT:  LBB32_14: ## %else35
+; SSE-NEXT:    testb $-128, %al
+; SSE-NEXT:    jne LBB32_15
+; SSE-NEXT:  LBB32_16: ## %else37
+; SSE-NEXT:    retq
+; SSE-NEXT:  LBB32_1: ## %cond.store
+; SSE-NEXT:    movl $0, (%rsi)
+; SSE-NEXT:    testb $2, %al
+; SSE-NEXT:    je LBB32_4
+; SSE-NEXT:  LBB32_3: ## %cond.store24
+; SSE-NEXT:    movl $0, 4(%rsi)
+; SSE-NEXT:    testb $4, %al
+; SSE-NEXT:    je LBB32_6
+; SSE-NEXT:  LBB32_5: ## %cond.store26
+; SSE-NEXT:    movl $0, 8(%rsi)
+; SSE-NEXT:    testb $8, %al
+; SSE-NEXT:    je LBB32_8
+; SSE-NEXT:  LBB32_7: ## %cond.store28
+; SSE-NEXT:    movl $0, 12(%rsi)
+; SSE-NEXT:    testb $16, %al
+; SSE-NEXT:    je LBB32_10
+; SSE-NEXT:  LBB32_9: ## %cond.store30
+; SSE-NEXT:    movl $0, 16(%rsi)
+; SSE-NEXT:    testb $32, %al
+; SSE-NEXT:    je LBB32_12
+; SSE-NEXT:  LBB32_11: ## %cond.store32
+; SSE-NEXT:    movl $0, 20(%rsi)
+; SSE-NEXT:    testb $64, %al
+; SSE-NEXT:    je LBB32_14
+; SSE-NEXT:  LBB32_13: ## %cond.store34
+; SSE-NEXT:    movl $0, 24(%rsi)
+; SSE-NEXT:    testb $-128, %al
+; SSE-NEXT:    je LBB32_16
+; SSE-NEXT:  LBB32_15: ## %cond.store36
+; SSE-NEXT:    movl $0, 28(%rsi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: undefshuffle:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vmaskmovps %ymm1, %ymm0, (%rsi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: undefshuffle:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rsi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: undefshuffle:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT:    movb $15, %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1 {%k1}
+; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vmovdqu32 %zmm0, (%rsi) {%k1}
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: undefshuffle:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512VLDQ-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512VLDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VLDQ-NEXT:    movb $15, %al
+; AVX512VLDQ-NEXT:    kmovw %eax, %k1
+; AVX512VLDQ-NEXT:    vpcmpgtd %ymm0, %ymm1, %k1 {%k1}
+; AVX512VLDQ-NEXT:    vmovdqu32 %ymm1, (%rsi) {%k1}
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
+; AVX512VLBW-LABEL: undefshuffle:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    vpsllw $15, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpmovw2m %xmm0, %k0
+; AVX512VLBW-NEXT:    movl $15, %eax
+; AVX512VLBW-NEXT:    kmovd %eax, %k1
+; AVX512VLBW-NEXT:    kandd %k1, %k0, %k1
+; AVX512VLBW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vmovdqu32 %ymm0, (%rsi) {%k1}
+; AVX512VLBW-NEXT:    vzeroupper
+; AVX512VLBW-NEXT:    retq
+;
+; X86-AVX512-LABEL: undefshuffle:
+; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    vpsllw $15, %xmm0, %xmm0
+; X86-AVX512-NEXT:    vpmovw2m %xmm0, %k0
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT:    movl $15, %ecx
+; X86-AVX512-NEXT:    kmovd %ecx, %k1
+; X86-AVX512-NEXT:    kandd %k1, %k0, %k1
+; X86-AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; X86-AVX512-NEXT:    vmovdqu32 %ymm0, (%eax) {%k1}
+; X86-AVX512-NEXT:    vzeroupper
+; X86-AVX512-NEXT:    retl
+  %i1 = shufflevector <8 x i1> %i0, <8 x i1> zeroinitializer, <16 x i32>
+  %i2 = shufflevector <16 x i1> %i1, <16 x i1> zeroinitializer, <32 x i32>
+  %it51 = and <32 x i1> %i2,
+  %i3 = shufflevector <32 x i1> %it51, <32 x i1> zeroinitializer, <8 x i32>
+  %i4 = shufflevector <32 x i1> %it51, <32 x i1> zeroinitializer, <8 x i32>
+  %i5 = tail call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %src, i32 1, <8 x i1> %i4, <8 x i32> zeroinitializer)
+  tail call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr %dst, i32 1, <8 x i1> %i3)
+  ret void
+}
+declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr nocapture, i32 immarg, <8 x i1>, <8 x i32>)
+
 declare void @llvm.masked.store.v8f64.p0(<8 x double>, ptr, i32, <8 x i1>)
 declare void @llvm.masked.store.v4f64.p0(<4 x double>, ptr, i32, <4 x i1>)
 declare void @llvm.masked.store.v2f64.p0(<2 x double>, ptr, i32, <2 x i1>)
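For reference, the mask rewrite performed by the patched combineShuffleOfSplatVal() can be sketched outside of LLVM as follows. This is a minimal standalone illustration, not the DAGCombiner code itself: the rewriteSplatMask() helper, the std::vector containers, and the UndefBits stand-in for the APInt UndefElts computed by DAG.isSplatValue() are all assumptions made for the example. Undef lanes use the same -1 sentinel as shuffle masks, and returning std::nullopt models the new fold to UNDEF that the old code asserted on ("Undef shuffle?").

// Standalone sketch of the splat-mask rewrite; names and containers are
// illustrative, not LLVM's.
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <optional>
#include <vector>

// Returns the rewritten splat mask, or std::nullopt to signal "fold to UNDEF"
// (the case the pre-patch code asserted on: every demanded element is
// known-undef).
static std::optional<std::vector<int>>
rewriteSplatMask(const std::vector<int> &Mask,
                 const std::vector<bool> &UndefBits) {
  // Which lowest demanded element is *not* known-undef?
  std::optional<unsigned> MinNonUndefIdx;
  for (int Idx : Mask) {
    if (Idx < 0 || UndefBits[Idx])
      continue; // Ignore sentinel indices, and undef elements.
    MinNonUndefIdx = std::min<unsigned>(Idx, MinNonUndefIdx.value_or(~0U));
  }
  if (!MinNonUndefIdx)
    return std::nullopt; // All demanded elements are undef - result is undef.

  std::vector<int> SplatMask = Mask;
  for (int &Idx : SplatMask) {
    if (Idx < 0)
      continue; // Passthrough sentinel indices.
    // Otherwise pick the lowest demanded non-undef element, or sentinel undef
    // if we know we'd pick a known-undef element.
    Idx = UndefBits[Idx] ? -1 : int(*MinNonUndefIdx);
  }
  return SplatMask;
}

int main() {
  // Lanes 0-3 of the splat source are known-undef; a mask that demands only
  // those lanes now folds to undef instead of tripping the assertion.
  std::vector<bool> UndefBits = {true, true, true, true,
                                 false, false, false, false};
  assert(!rewriteSplatMask({0, 1, 2, 3, -1, -1, -1, -1}, UndefBits));

  // Lane 4 is not undef, so every demanded non-undef lane is rewritten to 4.
  auto NewMask = rewriteSplatMask({0, 4, 2, 4, -1, -1, -1, -1}, UndefBits);
  assert(NewMask);
  for (int Idx : *NewMask)
    std::printf("%d ", Idx); // prints: -1 4 -1 4 -1 -1 -1 -1
  std::printf("\n");
  return 0;
}

The first call in main() models the all-undef-demanded case that the new early-out handles, which is the situation the @undefshuffle test above is intended to cover.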