diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
index fb41347b13136..bfa9da5a95a9b 100644
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -361,6 +361,45 @@ define <2 x i64> @combine_mul_to_abs_v2i64(<2 x i64> %x) {
   ret <2 x i64> %m
 }
 
+; 'Quadratic Reciprocity' - and(mul(x,x),2) -> 0
+
+define i64 @combine_mul_self_knownbits(i64 %x) {
+; SSE-LABEL: combine_mul_self_knownbits:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    imull %eax, %eax
+; SSE-NEXT:    andl $2, %eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_mul_self_knownbits:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movq %rdi, %rax
+; AVX-NEXT:    imull %eax, %eax
+; AVX-NEXT:    andl $2, %eax
+; AVX-NEXT:    retq
+  %1 = mul i64 %x, %x
+  %2 = and i64 %1, 2
+  ret i64 %2
+}
+
+define <4 x i32> @combine_mul_self_knownbits_vector(<4 x i32> %x) {
+; SSE-LABEL: combine_mul_self_knownbits_vector:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmulld %xmm0, %xmm0
+; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_mul_self_knownbits_vector:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmulld %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = mul <4 x i32> %x, %x
+  %2 = and <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x i32> %2
+}
+
 ; This would infinite loop because DAGCombiner wants to turn this into a shift,
 ; but x86 lowering wants to avoid non-uniform vector shift amounts.
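
A note on why the and(mul(x,x),2) -> 0 fold named in the test comment is sound (added here for context, not part of the test file): the low two bits of x*x depend only on the low two bits of x, and every square is congruent to 0 or 1 modulo 4 ((2k)^2 = 4k^2, (2k+1)^2 = 4k(k+1) + 1), so bit 1 of a self-multiply is always known zero regardless of bit width. The small C sketch below exhaustively verifies the claim over all 16-bit values; it is an illustrative check only, not anything from the LLVM sources.

#include <stdint.h>
#include <stdio.h>

/* Exhaustive check that bit 1 of x*x is always clear: any square is
 * 0 or 1 (mod 4). Bit 1 of the product depends only on the low two
 * bits of x, so checking all 16-bit values covers wider (wrapping)
 * multiplies as well. */
int main(void) {
  for (uint32_t x = 0; x <= UINT16_MAX; ++x) {
    uint16_t sq = (uint16_t)(x * x);  /* wrapping multiply, like the IR mul */
    if (sq & 2) {
      printf("counterexample: x = %u\n", x);
      return 1;
    }
  }
  printf("(x*x) & 2 == 0 for all 16-bit x\n");
  return 0;
}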