[DAGCombiner] protect against an infinite loop between shl <--> mul (PR35579)

At first, I tried to thread the x86 needle and use a target hook (isVectorShiftByScalarCheap())
to disable the transform only for non-splat pow-of-2 constants, but not on AVX2, and even then
only for some element types... it's difficult.

Here we just avoid the loop between the general DAG combine and the conflicting x86 vector
transform, and AFAICT we preserve all of the existing behavior otherwise.
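For context, the guard added below compares against the DAG combiner's phase enum
(paraphrased here from LLVM's DAGCombine.h). 'Level <= AfterLegalizeVectorOps' keeps the
vector fold enabled in every phase except the final post-legalization combine, where the
conflicting x86 vector-shift-to-multiply transform also runs:

enum CombineLevel {
  BeforeLegalizeTypes,
  AfterLegalizeTypes,
  AfterLegalizeVectorOps,
  AfterLegalizeDAG  // the vector mul -> shl fold is now skipped at this level
};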

Some tests that will probably fail if someone does try to restrict this in a more targeted,
x86-only way may be found in:

test/CodeGen/X86/combine-mul.ll
test/CodeGen/X86/vector-mul.ll
test/CodeGen/X86/widen_arith-5.ll

This should prevent the infinite looping seen with:
https://bugs.llvm.org/show_bug.cgi?id=35579

Differential Revision: https://reviews.llvm.org/D41040

llvm-svn: 320374
rotateright committed Dec 11, 2017
1 parent c07e6a0 commit f3436d7
Showing 2 changed files with 36 additions and 1 deletion.
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (2 additions, 1 deletion)
@@ -2675,7 +2675,8 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
   }
   // fold (mul x, (1 << c)) -> x << c
   if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
-      DAG.isKnownToBeAPowerOfTwo(N1)) {
+      DAG.isKnownToBeAPowerOfTwo(N1) &&
+      (!VT.isVector() || Level <= AfterLegalizeVectorOps)) {
     SDLoc DL(N);
     SDValue LogBase2 = BuildLogBase2(N1, DL);
     AddToWorklist(LogBase2.getNode());
llvm/test/CodeGen/X86/combine-mul.ll (34 additions, 0 deletions)
@@ -308,3 +308,37 @@ define <4 x i32> @combine_vec_mul_add(<4 x i32> %x) {
   %2 = mul <4 x i32> %1, <i32 4, i32 6, i32 2, i32 0>
   ret <4 x i32> %2
 }
+
+; This would infinite loop because DAGCombiner wants to turn this into a shift,
+; but x86 lowering wants to avoid non-uniform vector shift amounts.
+
+define <16 x i8> @PR35579(<16 x i8> %x) {
+; SSE-LABEL: PR35579:
+; SSE: # %bb.0:
+; SSE-NEXT: pmovsxbw %xmm0, %xmm1
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT: pmovsxbw %xmm0, %xmm0
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: PR35579:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+  %r = mul <16 x i8> %x, <i8 0, i8 1, i8 2, i8 1, i8 4, i8 1, i8 2, i8 1, i8 8, i8 1, i8 2, i8 1, i8 4, i8 1, i8 2, i8 1>
+  ret <16 x i8> %r
+}
+
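To make the comment in that test concrete, here is a minimal, self-contained toy
(hypothetical names; nothing below is LLVM API) of how two mutually-inverse rewrites
deny a worklist-driven combiner a fixed point:

#include <cstdio>

// Toy model of the bug: rule A (generic combine) rewrites a multiply by a
// power-of-2 vector into a shift; rule B (x86 lowering) rewrites a shift by
// a non-uniform vector amount back into a multiply. Each rewrite re-queues
// the node, so without a guard the combiner ping-pongs forever.
enum class Node { MulByPow2Vec, ShlByNonUniformVec };

int main() {
  Node N = Node::MulByPow2Vec;
  for (int Step = 0; Step < 6; ++Step) { // bounded here; unbounded in PR35579
    if (N == Node::MulByPow2Vec) {
      std::puts("generic combine: mul (pow2 vector) -> shl");
      N = Node::ShlByNonUniformVec;
    } else {
      std::puts("x86 transform:   shl (non-uniform) -> mul");
      N = Node::MulByPow2Vec;
    }
  }
  std::puts("...neither rewrite ever reaches a fixed point");
  return 0;
}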
