diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 66f5802a67465..593c7627a6575 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29629,9 +29629,9 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
     }
     if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
       SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
-      SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
       SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
-      SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
+      SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, DAG.getBitcast(ExVT, A),
+                                DAG.getBitcast(ExVT, B));
       SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
       RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
       RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll
index 1133cdfd083be..d21df472f06cb 100644
--- a/llvm/test/CodeGen/X86/avx2-arith.ll
+++ b/llvm/test/CodeGen/X86/avx2-arith.ll
@@ -121,14 +121,13 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
 define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
 ; CHECK-LABEL: mul_v32i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; CHECK-NEXT:    vpmaddubsw %ymm3, %ymm0, %ymm3
-; CHECK-NEXT:    vpand %ymm2, %ymm3, %ymm3
-; CHECK-NEXT:    vpandn %ymm1, %ymm2, %ymm1
+; CHECK-NEXT:    vpmullw %ymm1, %ymm0, %ymm2
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; CHECK-NEXT:    vpandn %ymm1, %ymm3, %ymm1
 ; CHECK-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vpsllw $8, %ymm0, %ymm0
-; CHECK-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; CHECK-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %x = mul <32 x i8> %i, %j
   ret <32 x i8> %x
diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
index 29c41cac222b2..15d187a5baeec 100644
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -504,7 +504,7 @@ define <16 x i8> @PR35579(<16 x i8> %x) {
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; SSE-NEXT:    psllw $8, %xmm1
-; SSE-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,2,0,4,0,2,0,8,0,2,0,4,0,2,0]
+; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,2,1,4,1,2,1,8,1,2,1,4,1,2,1]
 ; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll
index 30f1874c51fed..638d88481f071 100644
--- a/llvm/test/CodeGen/X86/gfni-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-shifts.ll
@@ -388,7 +388,7 @@ define <16 x i8> @constant_shl_v16i8(<16 x i8> %a) nounwind {
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm1
 ; GFNISSE-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
 ; GFNISSE-NEXT:    psllw $8, %xmm1
-; GFNISSE-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNISSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
 ; GFNISSE-NEXT:    pand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; GFNISSE-NEXT: por %xmm1, %xmm0 ; GFNISSE-NEXT: retq @@ -397,7 +397,7 @@ define <16 x i8> @constant_shl_v16i8(<16 x i8> %a) nounwind { ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] ; GFNIAVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] +; GFNIAVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; GFNIAVX1-NEXT: retq @@ -1213,21 +1213,20 @@ define <32 x i8> @splatvar_ashr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind { ; GFNISSE-LABEL: constant_shl_v32i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; GFNISSE-NEXT: movdqa %xmm0, %xmm3 -; GFNISSE-NEXT: pmaddubsw %xmm2, %xmm3 +; GFNISSE-NEXT: pmullw %xmm2, %xmm3 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; GFNISSE-NEXT: pand %xmm4, %xmm3 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] ; GFNISSE-NEXT: pmaddubsw %xmm5, %xmm0 ; GFNISSE-NEXT: psllw $8, %xmm0 ; GFNISSE-NEXT: por %xmm3, %xmm0 -; GFNISSE-NEXT: movdqa %xmm1, %xmm3 -; GFNISSE-NEXT: pmaddubsw %xmm2, %xmm3 -; GFNISSE-NEXT: pand %xmm4, %xmm3 +; GFNISSE-NEXT: pmullw %xmm1, %xmm2 +; GFNISSE-NEXT: pand %xmm4, %xmm2 ; GFNISSE-NEXT: pmaddubsw %xmm5, %xmm1 ; GFNISSE-NEXT: psllw $8, %xmm1 -; GFNISSE-NEXT: por %xmm3, %xmm1 +; GFNISSE-NEXT: por %xmm2, %xmm1 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: constant_shl_v32i8: @@ -1239,9 +1238,9 @@ define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX1-NEXT: vpmaddubsw %xmm1, %xmm3, %xmm1 ; GFNIAVX1-NEXT: vpsllw $8, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2] -; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] +; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm3 +; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1251,14 +1250,14 @@ define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] ; GFNIAVX2-NEXT: vpsllw $8, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] +; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512VL-LABEL: constant_shl_v32i8: ; GFNIAVX512VL: # %bb.0: -; GFNIAVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # 
[1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] +; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; GFNIAVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] ; GFNIAVX512VL-NEXT: vpsllw $8, %ymm0, %ymm0 ; GFNIAVX512VL-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 | (ymm1 & m32bcst) @@ -2521,9 +2520,9 @@ define <64 x i8> @splatvar_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind { ; GFNISSE-LABEL: constant_shl_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [1,4,16,64,128,32,8,2] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; GFNISSE-NEXT: movdqa %xmm0, %xmm6 -; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6 +; GFNISSE-NEXT: pmullw %xmm4, %xmm6 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] ; GFNISSE-NEXT: pand %xmm5, %xmm6 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] @@ -2531,23 +2530,22 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind { ; GFNISSE-NEXT: psllw $8, %xmm0 ; GFNISSE-NEXT: por %xmm6, %xmm0 ; GFNISSE-NEXT: movdqa %xmm1, %xmm6 -; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6 +; GFNISSE-NEXT: pmullw %xmm4, %xmm6 ; GFNISSE-NEXT: pand %xmm5, %xmm6 ; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm1 ; GFNISSE-NEXT: psllw $8, %xmm1 ; GFNISSE-NEXT: por %xmm6, %xmm1 ; GFNISSE-NEXT: movdqa %xmm2, %xmm6 -; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6 +; GFNISSE-NEXT: pmullw %xmm4, %xmm6 ; GFNISSE-NEXT: pand %xmm5, %xmm6 ; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm2 ; GFNISSE-NEXT: psllw $8, %xmm2 ; GFNISSE-NEXT: por %xmm6, %xmm2 -; GFNISSE-NEXT: movdqa %xmm3, %xmm6 -; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6 -; GFNISSE-NEXT: pand %xmm5, %xmm6 +; GFNISSE-NEXT: pmullw %xmm3, %xmm4 +; GFNISSE-NEXT: pand %xmm5, %xmm4 ; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm3 ; GFNISSE-NEXT: psllw $8, %xmm3 -; GFNISSE-NEXT: por %xmm6, %xmm3 +; GFNISSE-NEXT: por %xmm4, %xmm3 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: constant_shl_v64i8: @@ -2559,9 +2557,9 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm4, %xmm5 ; GFNIAVX1-NEXT: vpsllw $8, %xmm5, %xmm5 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,4,16,64,128,32,8,2] -; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] +; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4 +; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; GFNIAVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 @@ -2572,8 +2570,8 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm6, %xmm2 ; GFNIAVX1-NEXT: vpsllw $8, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm6, %xmm3 -; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 +; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm3 +; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; GFNIAVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 ; GFNIAVX1-NEXT: 
vorps %ymm2, %ymm1, %ymm1 @@ -2581,9 +2579,9 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind { ; ; GFNIAVX2-LABEL: constant_shl_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] +; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1] -; GFNIAVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm3 +; GFNIAVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm3 ; GFNIAVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; GFNIAVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] @@ -2591,7 +2589,7 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX2-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpsllw $8, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 -; GFNIAVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm2 +; GFNIAVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm2 ; GFNIAVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpsllw $8, %ymm1, %ymm1 @@ -2601,10 +2599,10 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX512VL-LABEL: constant_shl_v64i8: ; GFNIAVX512VL: # %bb.0: ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] +; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; GFNIAVX512VL-NEXT: # ymm2 = mem[0,1,0,1] -; GFNIAVX512VL-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm3 -; GFNIAVX512VL-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm2 +; GFNIAVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; GFNIAVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm2 ; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] ; GFNIAVX512VL-NEXT: # ymm3 = mem[0,1,0,1] @@ -2618,7 +2616,7 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind { ; ; GFNIAVX512BW-LABEL: constant_shl_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] +; GFNIAVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; GFNIAVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] ; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 ; GFNIAVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & m32bcst) diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll index a798f4c38f68f..541ca9d4f4096 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -2368,17 +2368,15 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x 
i8> %a2) nounwin ; SSE41-NEXT: psubb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pand %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm1, %xmm5 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 -; SSE41-NEXT: pand %xmm3, %xmm5 -; SSE41-NEXT: pandn %xmm2, %xmm3 -; SSE41-NEXT: pmaddubsw %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pmullw %xmm2, %xmm3 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pandn %xmm2, %xmm4 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm1 ; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: por %xmm1, %xmm5 -; SSE41-NEXT: paddb %xmm5, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm3 +; SSE41-NEXT: paddb %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: vec128_i8_signed_reg_reg: @@ -2390,14 +2388,13 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpandn %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -2429,12 +2426,10 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 -; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] +; XOP-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOP-FALLBACK-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3 +; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOP-FALLBACK-NEXT: retq ; @@ -2447,12 +2442,10 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 -; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = 
xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOPAVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3 +; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -2591,17 +2584,15 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; SSE41-NEXT: psubb %xmm2, %xmm1 ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm5 -; SSE41-NEXT: pmaddubsw %xmm3, %xmm5 -; SSE41-NEXT: pand %xmm2, %xmm5 -; SSE41-NEXT: pandn %xmm4, %xmm2 -; SSE41-NEXT: pmaddubsw %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pmullw %xmm4, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm3, %xmm2 +; SSE41-NEXT: pandn %xmm4, %xmm3 +; SSE41-NEXT: pmaddubsw %xmm3, %xmm1 ; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: por %xmm1, %xmm5 -; SSE41-NEXT: paddb %xmm5, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm2 +; SSE41-NEXT: paddb %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: vec128_i8_unsigned_reg_reg: @@ -2615,14 +2606,13 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -2656,12 +2646,10 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 -; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] +; XOP-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOP-FALLBACK-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3 +; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOP-FALLBACK-NEXT: retq ; @@ -2674,12 +2662,10 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 -; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOPAVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3 +; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -2822,16 +2808,14 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; SSE41-NEXT: psubb %xmm3, %xmm0 ; SSE41-NEXT: psrlw $1, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pand %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 -; SSE41-NEXT: pand %xmm3, %xmm5 -; SSE41-NEXT: pandn %xmm2, %xmm3 -; SSE41-NEXT: pmaddubsw %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pmullw %xmm2, %xmm3 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pandn %xmm2, %xmm4 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm0 ; SSE41-NEXT: psllw $8, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: paddb %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -2845,14 +2829,13 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpandn %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -2886,12 +2869,10 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm0, %xmm0 ; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm0, %xmm0 -; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 -; XOP-FALLBACK-NEXT: vpmaddubsw 
%xmm4, %xmm0, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 -; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14] +; XOP-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOP-FALLBACK-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm3 +; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2],xmm3[2],xmm0[4],xmm3[4],xmm0[6],xmm3[6],xmm0[8],xmm3[8],xmm0[10],xmm3[10],xmm0[12],xmm3[12],xmm0[14],xmm3[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; XOP-FALLBACK-NEXT: retq ; @@ -2905,12 +2886,10 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; XOPAVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 -; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4 -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14] +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOPAVX1-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm3 +; XOPAVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2],xmm3[2],xmm0[4],xmm3[4],xmm0[6],xmm3[6],xmm0[8],xmm3[8],xmm0[10],xmm3[10],xmm0[12],xmm3[12],xmm0[14],xmm3[14] ; XOPAVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -3053,16 +3032,14 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; SSE41-NEXT: psubb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pand %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm1, %xmm5 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 -; SSE41-NEXT: pand %xmm3, %xmm5 -; SSE41-NEXT: pandn %xmm2, %xmm3 -; SSE41-NEXT: pmaddubsw %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pmullw %xmm2, %xmm3 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pandn %xmm2, %xmm4 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm1 ; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: paddb %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -3076,14 +3053,13 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpandn %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm4, 
%xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -3117,12 +3093,10 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 -; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] +; XOP-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOP-FALLBACK-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3 +; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOP-FALLBACK-NEXT: retq ; @@ -3136,12 +3110,10 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 -; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOPAVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3 +; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -3286,16 +3258,14 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; SSE41-NEXT: psubb %xmm3, %xmm0 ; SSE41-NEXT: psrlw $1, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pand %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 -; SSE41-NEXT: pand %xmm3, %xmm5 -; SSE41-NEXT: pandn %xmm2, %xmm3 -; SSE41-NEXT: pmaddubsw %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pmullw %xmm2, %xmm3 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pandn %xmm2, %xmm4 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm0 ; SSE41-NEXT: psllw $8, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: paddb %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -3310,14 +3280,13 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 
= [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpandn %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -3353,12 +3322,10 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 -; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] +; XOP-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOP-FALLBACK-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3 +; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOP-FALLBACK-NEXT: retq ; @@ -3373,12 +3340,10 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 -; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOPAVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3 +; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOPAVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll index 7c9adaf31aff5..85791cd65163a 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -1896,40 +1896,38 @@ define <16 x i16> @vec256_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwind { ; AVX1-LABEL: vec256_i8_signed_reg_reg: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, 
%xmm3 +; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm5 ; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm6 ; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpsubb %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpminsb %xmm3, %xmm2, %xmm6 -; AVX1-NEXT: vpmaxsb %xmm3, %xmm2, %xmm3 -; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 +; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm6 +; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 -; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 -; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 -; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm7 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpandn %xmm5, %xmm8, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm7, %xmm1 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 -; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpandn %xmm4, %xmm8, %xmm4 +; AVX1-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -1943,14 +1941,13 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 -; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 -; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1974,15 +1971,13 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 -; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 -; 
XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 +; XOP-NEXT: vpmullw %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6 -; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 +; XOP-NEXT: vpmullw %xmm4, %xmm2, %xmm2 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -1998,14 +1993,13 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 -; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 -; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; @@ -2087,19 +2081,17 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm6 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm8 -; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 -; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 -; AVX1-NEXT: vpandn %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpmullw %xmm6, %xmm1, %xmm7 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpandn %xmm6, %xmm8, %xmm6 ; AVX1-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm7, %xmm1 ; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 -; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpandn %xmm4, %xmm8, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3 @@ -2119,14 +2111,13 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX2-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4 -; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 -; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm4 -; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm2 +; AVX2-NEXT: vpbroadcastw {{.*#+}} 
ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2150,15 +2141,13 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 -; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 -; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 +; XOP-NEXT: vpmullw %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6 -; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 +; XOP-NEXT: vpmullw %xmm4, %xmm2, %xmm2 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2175,14 +2164,13 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 -; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 -; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4 -; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm2 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; @@ -2247,41 +2235,39 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind { ; AVX1-LABEL: vec256_i8_signed_mem_reg: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm5 ; AVX1-NEXT: vpminsb %xmm0, %xmm1, %xmm6 ; AVX1-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsubb %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpminsb %xmm3, %xmm2, %xmm6 -; AVX1-NEXT: vpmaxsb %xmm3, %xmm2, %xmm3 -; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 +; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm6 +; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: 
vpsrlw $1, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 -; AVX1-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8 -; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 -; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm7 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpandn %xmm5, %xmm8, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm0, %xmm8, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm7, %xmm0 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 -; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpandn %xmm4, %xmm8, %xmm4 +; AVX1-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -2296,14 +2282,13 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX2-NEXT: vpsubb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX2-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm4 -; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 -; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -2328,15 +2313,13 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8 -; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 -; XOP-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0 +; XOP-NEXT: vpmullw %xmm5, %xmm0, %xmm0 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm6 -; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1 +; XOP-NEXT: vpmullw %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm2, %xmm0, %xmm0 @@ -2353,14 +2336,13 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: 
vpsrlw $1, %ymm0, %ymm0 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm4 -; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 -; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm3 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; @@ -2443,19 +2425,17 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 -; AVX1-NEXT: vpmaddubsw %xmm8, %xmm2, %xmm8 -; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 -; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpmullw %xmm5, %xmm2, %xmm7 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpandn %xmm5, %xmm8, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vpor %xmm2, %xmm7, %xmm2 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 -; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpandn %xmm4, %xmm8, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3 @@ -2474,14 +2454,13 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 -; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 -; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2506,15 +2485,13 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm2, %xmm8 -; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 -; XOP-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm2 +; XOP-NEXT: vpmullw %xmm5, %xmm2, %xmm2 ; XOP-NEXT: vmovdqa {{.*#+}} 
xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm2, %xmm2 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm3, %xmm6 -; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3 +; XOP-NEXT: vpmullw %xmm4, %xmm3, %xmm3 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm3, %xmm3 ; XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1 ; XOP-NEXT: vpaddb %xmm0, %xmm2, %xmm0 @@ -2531,14 +2508,13 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 -; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 -; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; @@ -2603,44 +2579,42 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; AVX1-LABEL: vec256_i8_signed_mem_mem: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %xmm1 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm2 -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa (%rsi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX1-NEXT: vmovdqa (%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm6 -; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpminsb %xmm0, %xmm2, %xmm6 +; AVX1-NEXT: vpmaxsb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpsubb %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpminsb %xmm1, %xmm3, %xmm6 +; AVX1-NEXT: vpmaxsb %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpsubb %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm6 -; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 -; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 -; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 -; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1 +; 
AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm7 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpandn %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm7, %xmm0 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 -; AVX1-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm5 -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpandn %xmm4, %xmm8, %xmm4 +; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: vec256_i8_signed_mem_mem: @@ -2654,14 +2628,13 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 -; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 -; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2687,15 +2660,13 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8 -; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 -; XOP-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0 +; XOP-NEXT: vpmullw %xmm5, %xmm0, %xmm0 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm6 -; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1 +; XOP-NEXT: vpmullw %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm2, %xmm0, %xmm0 @@ -2713,14 +2684,13 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 -; AVX512F-NEXT: vpand %ymm3, 
%ymm4, %ymm4 -; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll index 04f0a65c99da8..aa2dd00237b07 100644 --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -889,19 +889,17 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-SKX-NOVBMI-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-SKX-NOVBMI-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-SKX-NOVBMI-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-SKX-NOVBMI-NEXT: vpand %ymm4, %ymm3, %ymm5 -; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5 -; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; CHECK-SKX-NOVBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm4 +; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm3, %ymm5, %ymm3 ; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %ymm1, %ymm1 -; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm5 & ymm4) -; CHECK-SKX-NOVBMI-NEXT: vpand %ymm4, %ymm2, %ymm3 -; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 -; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm2, %ymm4, %ymm2 +; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm4 & ymm5) +; CHECK-SKX-NOVBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm3 +; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm2, %ymm5, %ymm2 ; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 ; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %ymm0, %ymm0 -; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm4) +; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm5) ; CHECK-SKX-NOVBMI-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-SKX-NOVBMI-NEXT: vmovdqa %ymm1, 32(%rdx) ; CHECK-SKX-NOVBMI-NEXT: vzeroupper @@ -913,20 +911,18 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-SKX-VBMI-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-SKX-VBMI-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-SKX-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5 -; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5 -; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3 +; CHECK-SKX-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm4 +; CHECK-SKX-VBMI-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-SKX-VBMI-NEXT: vpandn %ymm3, %ymm5, %ymm3 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62] -; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1 -; CHECK-SKX-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5 -; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, 
%ymm5 -; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm2, %ymm2 +; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm1, %ymm3, %ymm4 +; CHECK-SKX-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm1 +; CHECK-SKX-VBMI-NEXT: vpandn %ymm2, %ymm5, %ymm2 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 -; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0 -; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm0, (%rdx) -; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm0, %ymm3, %ymm1 +; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm1, (%rdx) +; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm4, 32(%rdx) ; CHECK-SKX-VBMI-NEXT: vzeroupper ; CHECK-SKX-VBMI-NEXT: retq ; @@ -936,19 +932,17 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-AVX512-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-AVX512-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-AVX512-NEXT: vpand %ymm4, %ymm3, %ymm5 -; CHECK-AVX512-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5 -; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm1, %ymm4 +; CHECK-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm5, %ymm3 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; CHECK-AVX512-NEXT: vpsllw $8, %ymm1, %ymm1 -; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm5 & ymm4) -; CHECK-AVX512-NEXT: vpand %ymm4, %ymm2, %ymm3 -; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 -; CHECK-AVX512-NEXT: vpandn %ymm2, %ymm4, %ymm2 +; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm4 & ymm5) +; CHECK-AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm3 +; CHECK-AVX512-NEXT: vpandn %ymm2, %ymm5, %ymm2 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpsllw $8, %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm4) +; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm5) ; CHECK-AVX512-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-AVX512-NEXT: vmovdqa %ymm1, 32(%rdx) ; CHECK-AVX512-NEXT: vzeroupper @@ -960,20 +954,18 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-VBMI-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-VBMI-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5 -; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5 -; CHECK-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3 +; CHECK-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm4 +; CHECK-VBMI-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-VBMI-NEXT: vpandn %ymm3, %ymm5, %ymm3 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62] -; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1 -; CHECK-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5 -; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5 -; CHECK-VBMI-NEXT: vpand %ymm4, %ymm2, %ymm2 +; CHECK-VBMI-NEXT: vpermt2b %ymm1, %ymm3, %ymm4 +; CHECK-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm1 +; CHECK-VBMI-NEXT: vpandn %ymm2, %ymm5, %ymm2 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 -; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0 -; CHECK-VBMI-NEXT: vmovdqa %ymm0, (%rdx) -; 
CHECK-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-VBMI-NEXT: vpermt2b %ymm0, %ymm3, %ymm1 +; CHECK-VBMI-NEXT: vmovdqa %ymm1, (%rdx) +; CHECK-VBMI-NEXT: vmovdqa %ymm4, 32(%rdx) ; CHECK-VBMI-NEXT: vzeroupper ; CHECK-VBMI-NEXT: retq %d = load <64 x i8>, ptr %a @@ -988,13 +980,12 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-SKX-NOVBMI: # %bb.0: ; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 (%rdi), %zmm0 ; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 (%rsi), %zmm1 -; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-SKX-NOVBMI-NEXT: vpandq %zmm2, %zmm1, %zmm3 -; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 -; CHECK-SKX-NOVBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm1 +; CHECK-SKX-NOVBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm2 +; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-SKX-NOVBMI-NEXT: vpandnq %zmm1, %zmm3, %zmm1 ; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 ; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %zmm0, %zmm0 -; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2) +; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm2 & zmm3) ; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 %zmm0, (%rdx) ; CHECK-SKX-NOVBMI-NEXT: vzeroupper ; CHECK-SKX-NOVBMI-NEXT: retq @@ -1003,13 +994,11 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-SKX-VBMI: # %bb.0: ; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0 ; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1 -; CHECK-SKX-VBMI-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-SKX-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3 -; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 -; CHECK-SKX-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1 +; CHECK-SKX-VBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm2 +; CHECK-SKX-VBMI-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 ; CHECK-SKX-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126] -; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1 +; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm0, %zmm2, %zmm1 ; CHECK-SKX-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx) ; CHECK-SKX-VBMI-NEXT: vzeroupper ; CHECK-SKX-VBMI-NEXT: retq @@ -1018,13 +1007,12 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 -; CHECK-AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm3 -; CHECK-AVX512-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 -; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm2, %zmm1 +; CHECK-AVX512-NEXT: vpmullw %zmm1, %zmm0, %zmm2 +; CHECK-AVX512-NEXT: vpbroadcastd {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm3, %zmm1 
; CHECK-AVX512-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 ; CHECK-AVX512-NEXT: vpsllw $8, %zmm0, %zmm0 -; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2) +; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm2 & zmm3) ; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) ; CHECK-AVX512-NEXT: vzeroupper ; CHECK-AVX512-NEXT: retq @@ -1033,13 +1021,11 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-VBMI: # %bb.0: ; CHECK-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0 ; CHECK-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1 -; CHECK-VBMI-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3 -; CHECK-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 -; CHECK-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1 +; CHECK-VBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm2 +; CHECK-VBMI-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; CHECK-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 ; CHECK-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126] -; CHECK-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1 +; CHECK-VBMI-NEXT: vpermi2b %zmm0, %zmm2, %zmm1 ; CHECK-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx) ; CHECK-VBMI-NEXT: vzeroupper ; CHECK-VBMI-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll index 00731fe3e9556..189c5aa9fee20 100644 --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -25,7 +25,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind { ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117] ; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0] +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -160,16 +160,14 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind { ; ; SSE41-LABEL: mul_v16i8: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pmaddubsw %xmm3, %xmm4 -; SSE41-NEXT: pand %xmm2, %xmm4 -; SSE41-NEXT: pandn %xmm1, %xmm2 -; SSE41-NEXT: pmaddubsw %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pmullw %xmm1, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm3, %xmm2 +; SSE41-NEXT: pandn %xmm1, %xmm3 +; SSE41-NEXT: pmaddubsw %xmm3, %xmm0 ; SSE41-NEXT: psllw $8, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX2-LABEL: mul_v16i8: @@ -400,28 +398,27 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind { ; ; SSE41-LABEL: mul_v32i8c: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pmaddubsw %xmm2, %xmm3 +; SSE41-NEXT: pmullw %xmm2, 
%xmm3 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pand %xmm4, %xmm3 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117] ; SSE41-NEXT: pmaddubsw %xmm5, %xmm0 ; SSE41-NEXT: psllw $8, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pmaddubsw %xmm2, %xmm3 -; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pmullw %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm2 ; SSE41-NEXT: pmaddubsw %xmm5, %xmm1 ; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm1 ; SSE41-NEXT: retq ; ; AVX2-LABEL: mul_v32i8c: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117] ; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1 -; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0] +; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -430,7 +427,7 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind { ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117] ; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0] +; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -584,49 +581,44 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind { ; ; SSE41-LABEL: mul_v32i8: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pandn %xmm2, %xmm5 -; SSE41-NEXT: pand %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: pmaddubsw %xmm2, %xmm6 -; SSE41-NEXT: pand %xmm4, %xmm6 -; SSE41-NEXT: pmaddubsw %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pmullw %xmm2, %xmm4 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: pandn %xmm2, %xmm6 +; SSE41-NEXT: pmaddubsw %xmm6, %xmm0 ; SSE41-NEXT: psllw $8, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: pand %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm5 -; SSE41-NEXT: pmaddubsw %xmm2, %xmm5 -; SSE41-NEXT: pand %xmm4, %xmm5 -; SSE41-NEXT: pandn %xmm3, %xmm4 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm1 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pmullw %xmm3, %xmm2 +; SSE41-NEXT: pand %xmm5, %xmm2 +; SSE41-NEXT: pandn %xmm3, %xmm5 +; SSE41-NEXT: pmaddubsw %xmm5, %xmm1 ; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm1 ; 
SSE41-NEXT: retq ; ; AVX2-LABEL: mul_v32i8: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm3 -; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpandn %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: mul_v32i8: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 -; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm3 -; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpandn %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: mul_v32i8: @@ -773,9 +765,9 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind { ; ; SSE41-LABEL: mul_v64i8c: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] ; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm6 +; SSE41-NEXT: pmullw %xmm4, %xmm6 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pand %xmm5, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117] @@ -783,36 +775,35 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind { ; SSE41-NEXT: psllw $8, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: movdqa %xmm1, %xmm6 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm6 +; SSE41-NEXT: pmullw %xmm4, %xmm6 ; SSE41-NEXT: pand %xmm5, %xmm6 ; SSE41-NEXT: pmaddubsw %xmm7, %xmm1 ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: por %xmm6, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm6 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm6 +; SSE41-NEXT: pmullw %xmm4, %xmm6 ; SSE41-NEXT: pand %xmm5, %xmm6 ; SSE41-NEXT: pmaddubsw %xmm7, %xmm2 ; SSE41-NEXT: psllw $8, %xmm2 ; SSE41-NEXT: por %xmm6, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm6 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm6 -; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: pmullw %xmm3, %xmm4 +; SSE41-NEXT: pand %xmm5, %xmm4 ; SSE41-NEXT: pmaddubsw %xmm7, %xmm3 ; SSE41-NEXT: psllw $8, %xmm3 -; SSE41-NEXT: por %xmm6, %xmm3 +; SSE41-NEXT: por %xmm4, %xmm3 ; SSE41-NEXT: retq ; ; AVX2-LABEL: mul_v64i8c: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0] -; AVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] +; AVX2-NEXT: vpmullw %ymm2, 
%ymm0, %ymm3 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117] ; AVX2-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm0 ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm2 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1 @@ -822,9 +813,9 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind { ; AVX512F-LABEL: mul_v64i8c: ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0] -; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] +; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117] ; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm0 @@ -837,7 +828,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind { ; ; AVX512BW-LABEL: mul_v64i8c: ; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0] +; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] ; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117] ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & m32bcst) @@ -899,59 +890,52 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; ; SSE41-LABEL: mul_v64i8: ; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa %xmm0, %xmm9 +; SSE41-NEXT: pmullw %xmm4, %xmm9 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pandn %xmm4, %xmm9 -; SSE41-NEXT: pand %xmm8, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm10 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm10 -; SSE41-NEXT: pand %xmm8, %xmm10 -; SSE41-NEXT: pmaddubsw %xmm9, %xmm0 -; SSE41-NEXT: psllw $8, %xmm0 -; SSE41-NEXT: por %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm4 -; SSE41-NEXT: pandn %xmm5, %xmm4 -; SSE41-NEXT: pand %xmm8, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm9 -; SSE41-NEXT: pmaddubsw %xmm5, %xmm9 ; SSE41-NEXT: pand %xmm8, %xmm9 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; 
SSE41-NEXT: pandn %xmm4, %xmm10 +; SSE41-NEXT: pmaddubsw %xmm10, %xmm0 +; SSE41-NEXT: psllw $8, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: pmullw %xmm5, %xmm4 +; SSE41-NEXT: pand %xmm8, %xmm4 +; SSE41-NEXT: movdqa %xmm8, %xmm9 +; SSE41-NEXT: pandn %xmm5, %xmm9 +; SSE41-NEXT: pmaddubsw %xmm9, %xmm1 ; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: por %xmm9, %xmm1 -; SSE41-NEXT: movdqa %xmm8, %xmm4 -; SSE41-NEXT: pandn %xmm6, %xmm4 -; SSE41-NEXT: pand %xmm8, %xmm6 -; SSE41-NEXT: movdqa %xmm2, %xmm5 -; SSE41-NEXT: pmaddubsw %xmm6, %xmm5 -; SSE41-NEXT: pand %xmm8, %xmm5 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm2 +; SSE41-NEXT: por %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pmullw %xmm6, %xmm4 +; SSE41-NEXT: pand %xmm8, %xmm4 +; SSE41-NEXT: movdqa %xmm8, %xmm5 +; SSE41-NEXT: pandn %xmm6, %xmm5 +; SSE41-NEXT: pmaddubsw %xmm5, %xmm2 ; SSE41-NEXT: psllw $8, %xmm2 -; SSE41-NEXT: por %xmm5, %xmm2 -; SSE41-NEXT: movdqa %xmm7, %xmm4 +; SSE41-NEXT: por %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pmullw %xmm7, %xmm4 ; SSE41-NEXT: pand %xmm8, %xmm4 -; SSE41-NEXT: movdqa %xmm3, %xmm5 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 -; SSE41-NEXT: pand %xmm8, %xmm5 ; SSE41-NEXT: pandn %xmm7, %xmm8 ; SSE41-NEXT: pmaddubsw %xmm8, %xmm3 ; SSE41-NEXT: psllw $8, %xmm3 -; SSE41-NEXT: por %xmm5, %xmm3 +; SSE41-NEXT: por %xmm4, %xmm3 ; SSE41-NEXT: retq ; ; AVX2-LABEL: mul_v64i8: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm5 -; AVX2-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5 -; AVX2-NEXT: vpand %ymm4, %ymm5, %ymm5 -; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm4 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpandn %ymm2, %ymm5, %ymm2 ; AVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2 -; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm2 +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpandn %ymm3, %ymm5, %ymm3 ; AVX2-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 @@ -959,33 +943,30 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; ; AVX512F-LABEL: mul_v64i8: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 -; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm5, %ymm4 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm6 -; AVX512F-NEXT: vpmaddubsw %ymm6, %ymm0, %ymm6 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 -; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm5 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; 
AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm5 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX512F-NEXT: vpandn %ymm1, %ymm5, %ymm1 ; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm1 -; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm5, %ymm1 +; AVX512F-NEXT: vpandn %ymm2, %ymm5, %ymm1 +; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm4 & zmm2) +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm4 & zmm5) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: mul_v64i8: ; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpandnq %zmm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpandnq %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2) +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm2 & zmm3) ; AVX512BW-NEXT: retq entry: %A = mul <64 x i8> %i, %j diff --git a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll index 59b03f8c02223..c9e48f817fb44 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll @@ -58,13 +58,12 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) { define <32 x i8> @test_mul_32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX256BW-LABEL: test_mul_32i8: ; AVX256BW: # %bb.0: -; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX256BW-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX256BW-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 -; AVX256BW-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX256BW-NEXT: vpmullw %ymm1, %ymm0, %ymm2 +; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX256BW-NEXT: vpandn %ymm1, %ymm3, %ymm1 ; AVX256BW-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ; AVX256BW-NEXT: vpsllw $8, %ymm0, %ymm0 -; AVX256BW-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm2) +; AVX256BW-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm3) ; AVX256BW-NEXT: retq ; ; AVX512BWVL-LABEL: test_mul_32i8: diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll index bb7245c31b326..ec94d003f10ea 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -2275,8 +2275,8 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm4 ; CHECK-SSE41-NEXT: movq %rdi, %rax -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [171,0,183,0,61,0,127,0,9,0,41,0,1,0,161,0] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = 
[171,103,183,171,61,1,127,183,9,0,41,183,1,1,161,221] +; CHECK-SSE41-NEXT: pmullw %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] ; CHECK-SSE41-NEXT: pand %xmm5, %xmm0 ; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm6 @@ -2302,8 +2302,8 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-SSE41-NEXT: pcmpgtb %xmm6, %xmm1 ; CHECK-SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] ; CHECK-SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm0 -; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [197,0,27,0,1,0,1,0,223,0,205,0,161,0,171,0] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [197,205,27,241,1,1,1,163,223,223,205,183,161,1,171,239] +; CHECK-SSE41-NEXT: pmullw %xmm4, %xmm0 ; CHECK-SSE41-NEXT: pand %xmm5, %xmm0 ; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [0,205,0,241,0,1,0,163,0,223,0,183,0,1,0,239] ; CHECK-SSE41-NEXT: psllw $8, %xmm4 @@ -2341,7 +2341,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,79,0,103,0,27,0,121,0,129,0,129,0,129,0,47] ; CHECK-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 ; CHECK-AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm4 -; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm5 # [0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0] +; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm5 # [0,0,0,0,1,1,1,0,1,1,0,1,0,1,0,1] ; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; CHECK-AVX1-NEXT: vpand %xmm3, %xmm5, %xmm5 ; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6 # [0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,1] @@ -2361,7 +2361,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; CHECK-AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 ; CHECK-AVX1-NEXT: vpaddb %xmm4, %xmm6, %xmm4 -; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm6 # [13,0,19,0,2,0,2,0,62,0,5,0,97,0,3,0] +; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm6 # [13,5,19,34,2,8,2,88,62,62,5,7,97,2,3,60] ; CHECK-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm6 ; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,5,0,34,0,8,0,88,0,62,0,7,0,2,0,60] ; CHECK-AVX1-NEXT: vpsllw $8, %xmm4, %xmm4 @@ -2375,7 +2375,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7 # [0,86,0,95,0,147,0,43,0,49,0,127,0,65,0,147] ; CHECK-AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7 ; CHECK-AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm6 -; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm7 # [0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0] +; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm7 # [0,0,1,0,0,255,0,1,0,1,0,1,1,1,0,1] ; CHECK-AVX1-NEXT: vpand %xmm3, %xmm7, %xmm7 ; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm8 # [0,0,0,0,0,255,0,1,0,1,0,1,0,1,0,1] ; CHECK-AVX1-NEXT: vpsllw $8, %xmm8, %xmm8 @@ -2394,7 +2394,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 ; CHECK-AVX1-NEXT: vpand %xmm5, %xmm6, %xmm5 ; CHECK-AVX1-NEXT: vpaddb %xmm5, %xmm7, %xmm5 -; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm6 # [3,0,7,0,84,0,127,0,114,0,50,0,2,0,97,0] +; CHECK-AVX1-NEXT: vpmullw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm6 # [3,87,7,6,84,128,127,56,114,1,50,7,2,8,97,117] ; CHECK-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm3 ; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [0,87,0,6,0,128,0,56,0,1,0,7,0,8,0,117] ; CHECK-AVX1-NEXT: vpsllw $8, %xmm5, %xmm5 @@ -2423,7 +2423,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,79,0,103,0,27,0,121,0,129,0,129,0,129,0,47,0,86,0,95,0,147,0,43,0,49,0,127,0,65,0,147] ; CHECK-AVX2-NEXT: vpsrlw $8, %ymm4, %ymm4 ; CHECK-AVX2-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 -; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 # [0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0] +; CHECK-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 # [0,0,0,0,1,1,1,0,1,1,0,1,0,1,0,1,0,0,1,0,0,255,0,1,0,1,0,1,1,1,0,1] ; CHECK-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm6 # [0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,255,0,1,0,1,0,1,0,1,0,1] @@ -2443,7 +2443,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 ; CHECK-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 ; CHECK-AVX2-NEXT: vpaddb %ymm3, %ymm4, %ymm3 -; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 # [13,0,19,0,2,0,2,0,62,0,5,0,97,0,3,0,3,0,7,0,84,0,127,0,114,0,50,0,2,0,97,0] +; CHECK-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 # [13,5,19,34,2,8,2,88,62,62,5,7,97,2,3,60,3,87,7,6,84,128,127,56,114,1,50,7,2,8,97,117] ; CHECK-AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,5,0,34,0,8,0,88,0,62,0,7,0,2,0,60,0,87,0,6,0,128,0,56,0,1,0,7,0,8,0,117] ; CHECK-AVX2-NEXT: vpsllw $8, %ymm3, %ymm3 @@ -2458,7 +2458,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; ; CHECK-AVX512VL-LABEL: pr51133: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [197,0,27,0,1,0,1,0,223,0,205,0,161,0,171,0,171,0,183,0,61,0,127,0,9,0,41,0,1,0,161,0] +; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [197,205,27,241,1,1,1,163,223,223,205,183,161,1,171,239,171,103,183,171,61,1,127,183,9,0,41,183,1,1,161,221] ; CHECK-AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 # [0,205,0,241,0,1,0,163,0,223,0,183,0,1,0,239,0,103,0,171,0,1,0,183,0,0,0,183,0,1,0,221] ; CHECK-AVX512VL-NEXT: vpsllw $8, %ymm3, %ymm3 ; CHECK-AVX512VL-NEXT: vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & m32bcst) diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index 33a6a7679bb9a..a5d6900f77f97 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -2014,7 +2014,7 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64] ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64] ; 
SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -2033,7 +2033,7 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64] ; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64] ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll index 217431be10d88..0cffa1b78a654 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -1631,9 +1631,9 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [128,32,8,2,128,2,8,32] -; AVX1-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpmaddubsw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64] +; AVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1653,7 +1653,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64,0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64] ; AVX2-NEXT: vpsllw $8, %ymm2, %ymm2 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] +; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64,128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -1672,7 +1672,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64,0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64] ; AVX512F-NEXT: vpsllw $8, %ymm2, %ymm2 ; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] +; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64,128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -1690,7 +1690,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64,0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64] ; AVX512VL-NEXT: vpsllw $8, %ymm2, %ymm2 -; AVX512VL-NEXT: vpmaddubsw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] +; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64,128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64] ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | ymm1 | ymm2 ; AVX512VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll index 3a522ccb6214a..25f8f94eb834c 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -915,10 +915,10 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; AVX512F-NEXT: vpsllw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64,128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64] ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm3 @@ -957,10 +957,10 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512VL-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $8, %ymm3, %ymm3 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64,128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64] ; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512VL-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmaddubsw %ymm4, %ymm2, %ymm2 +; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmullw %ymm4, %ymm2, %ymm2 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VL-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm3 diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll index e68d1d792c90a..9b7d66def8b5b 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -731,7 +731,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: movdqa %xmm2, %xmm1 ; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7] ; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0] +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7] ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: por %xmm1, %xmm2 ; SSE41-NEXT: psubb %xmm2, %xmm0 @@ -762,7 +762,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # 
[0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7] ; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0] +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7] ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll index 7355f3683fc2e..fa5692aa9cef1 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -660,7 +660,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpackuswb %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm5 ; AVX1-NEXT: vpsubb %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm5 # [22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0] +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm5 # [22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7] ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7] @@ -686,7 +686,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0] +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22] ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22] ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 @@ -720,7 +720,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind { ; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm2, %ymm1 ; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7] ; AVX2NOBW-NEXT: vpsllw $8, %ymm2, %ymm2 -; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0] +; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7] ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpor %ymm2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll index 5445330c82922..b11756a5e3b4e 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -544,7 +544,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm5 ; AVX512F-NEXT: vpsubb %ymm5, %ymm3, %ymm3 -; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 # [38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0] +; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 # 
[38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7] ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7] @@ -570,7 +570,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsubb %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0] +; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38] ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38] ; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 @@ -603,7 +603,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: vpmovb2m %zmm1, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm2 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0,38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0] +; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7] ; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38,0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7] ; AVX512BW-NEXT: vpsllw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm2 & m32bcst) diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll index 6cd5098504f91..ef255e598e4a1 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -840,7 +840,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: movdqa %xmm2, %xmm1 ; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7] ; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0] +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7] ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: por %xmm1, %xmm2 ; SSE41-NEXT: psubb %xmm2, %xmm0 @@ -882,7 +882,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7] ; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0] +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # 
[7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
index 98ea87cbe18f3..ca57359183312 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -702,7 +702,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [32,16,16,128,64,16,256,32]
 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
 ; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm5 # [22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm5 # [22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
 ; AVX1-NEXT: vpand %xmm3, %xmm5, %xmm5
 ; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
@@ -739,7 +739,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [64,256,128,32,32,32,64,64]
 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
 ; AVX1-NEXT: vpackuswb %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22]
 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22]
 ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
@@ -781,7 +781,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
 ; AVX2NOBW-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
 ; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
 ; AVX2NOBW-NEXT: vpsllw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; AVX2NOBW-NEXT: vpor %ymm2, %ymm1, %ymm1
 ; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
index a11fa370a86b7..b8a131e628007 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -575,7 +575,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [64,16,32,8,8,8,256,16,32,16,16,128,64,16,256,32]
 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
 ; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm4
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5 # [38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5 # [38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm5
 ; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
@@ -609,7 +609,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [64,256,128,32,32,32,64,64,16,16,64,32,128,256,16,16]
 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
 ; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38]
 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38]
 ; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
@@ -648,7 +648,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
 ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
 ; AVX512BW-NEXT: vpackuswb %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0,38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
 ; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38,0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
 ; AVX512BW-NEXT: vpsllw $8, %zmm1, %zmm1
 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm2 & m32bcst)
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index d0bb90c5fc8ab..6d6f1c28ca282 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -265,7 +265,7 @@ define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounw
 ; X86-SSE4-NEXT: movdqa %xmm0, %xmm1
 ; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,2,0,8,0,2,0,8,0,2,0,8,0,2,0,8]
 ; X86-SSE4-NEXT: psllw $8, %xmm1
-; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,0,4,0,1,0,4,0,1,0,4,0,1,0,4,0]
+; X86-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8,1,2,4,8,1,2,4,8,1,2,4,8]
 ; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE4-NEXT: por %xmm1, %xmm0
 ; X86-SSE4-NEXT: retl
@@ -275,7 +275,7 @@ define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounw
 ; X64-SSE4-NEXT: movdqa %xmm0, %xmm1
 ; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,2,0,8,0,2,0,8,0,2,0,8,0,2,0,8]
 ; X64-SSE4-NEXT: psllw $8, %xmm1
-; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,4,0,1,0,4,0,1,0,4,0,1,0,4,0]
+; X64-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,1,2,4,8,1,2,4,8,1,2,4,8]
 ; X64-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE4-NEXT: por %xmm1, %xmm0
 ; X64-SSE4-NEXT: retq
@@ -1072,7 +1072,7 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8>
 ; X86-SSE4-NEXT: movdqa %xmm0, %xmm1
 ; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,3,0,17,0,65,0,2,0,9,0,33,0,129,0,3]
 ; X86-SSE4-NEXT: psllw $8, %xmm1
-; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2,0,9,0,33,0,129,0,3,0,17,0,65,0,2,0]
+; X86-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3]
 ; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE4-NEXT: por %xmm1, %xmm0
 ; X86-SSE4-NEXT: retl
@@ -1095,7 +1095,7 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8>
 ; X64-SSE4-NEXT: movdqa %xmm0, %xmm1
 ; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,3,0,17,0,65,0,2,0,9,0,33,0,129,0,3]
 ; X64-SSE4-NEXT: psllw $8, %xmm1
-; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,0,9,0,33,0,129,0,3,0,17,0,65,0,2,0]
+; X64-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3]
 ; X64-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE4-NEXT: por %xmm1, %xmm0
 ; X64-SSE4-NEXT: retq
@@ -1103,7 +1103,7 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8>
 ; X64-XOP-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
 ; X64-XOP: # %bb.0:
 ; X64-XOP-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,3,0,17,0,65,0,2,0,9,0,33,0,129,0,3]
-; X64-XOP-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2,0,9,0,33,0,129,0,3,0,17,0,65,0,2,0]
+; X64-XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3]
 ; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2],xmm1[2],xmm0[4],xmm1[4],xmm0[6],xmm1[6],xmm0[8],xmm1[8],xmm0[10],xmm1[10],xmm0[12],xmm1[12],xmm0[14],xmm1[14]
 ; X64-XOP-NEXT: retq
 ;
@@ -1847,7 +1847,7 @@ define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8>
 ; X86-SSE4-NEXT: movdqa %xmm0, %xmm1
 ; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,1,0,7,0,31,0,127,0,1,0,7,0,31,0,127]
 ; X86-SSE4-NEXT: psllw $8, %xmm1
-; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,3,0,15,0,63,0,0,0,3,0,15,0,63,0]
+; X86-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127]
 ; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE4-NEXT: por %xmm1, %xmm0
 ; X86-SSE4-NEXT: retl
@@ -1857,7 +1857,7 @@ define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8>
 ; X64-SSE4-NEXT: movdqa %xmm0, %xmm1
 ; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,1,0,7,0,31,0,127,0,1,0,7,0,31,0,127]
 ; X64-SSE4-NEXT: psllw $8, %xmm1
-; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,3,0,15,0,63,0,0,0,3,0,15,0,63,0]
+; X64-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127]
 ; X64-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE4-NEXT: por %xmm1, %xmm0
 ; X64-SSE4-NEXT: retq
@@ -1865,7 +1865,7 @@ define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8>
 ; X64-XOP-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
 ; X64-XOP: # %bb.0:
 ; X64-XOP-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,1,0,7,0,31,0,127,0,1,0,7,0,31,0,127]
-; X64-XOP-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,3,0,15,0,63,0,0,0,3,0,15,0,63,0]
+; X64-XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127]
 ; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2],xmm1[2],xmm0[4],xmm1[4],xmm0[6],xmm1[6],xmm0[8],xmm1[8],xmm0[10],xmm1[10],xmm0[12],xmm1[12],xmm0[14],xmm1[14]
 ; X64-XOP-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index 3085c325e0968..37b96b8f3f927 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -1165,7 +1165,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
 ; SSE41-NEXT: movdqa %xmm0, %xmm1
 ; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
 ; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT: por %xmm1, %xmm0
 ; SSE41-NEXT: retq
@@ -1174,7 +1174,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
 ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index f9ccd1e8ca156..c7d2532e9acb2 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -1313,9 +1313,9 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
 ; AVX1-NEXT: vpmaddubsw %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2]
-; AVX1-NEXT: vpmaddubsw %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
+; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
@@ -1325,7 +1325,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
 ; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
@@ -1352,7 +1352,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
 ; AVX512DQ-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT: retq
@@ -1366,7 +1366,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
 ;
 ; AVX512DQVL-LABEL: constant_shift_v32i8:
 ; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
 ; AVX512DQVL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
 ; AVX512DQVL-NEXT: vpsllw $8, %ymm0, %ymm0
 ; AVX512DQVL-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 | (ymm1 & m32bcst)
@@ -1388,9 +1388,9 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
 ; X86-AVX1-NEXT: vpmaddubsw %xmm1, %xmm3, %xmm1
 ; X86-AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; X86-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2]
-; X86-AVX1-NEXT: vpmaddubsw %xmm2, %xmm3, %xmm3
-; X86-AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
+; X86-AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X86-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
@@ -1400,7 +1400,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
 ; X86-AVX2: # %bb.0:
 ; X86-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
 ; X86-AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
-; X86-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
 ; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X86-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
index 41238acc4b74d..1e5f1b8729d47 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -307,10 +307,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
 ; AVX512DQ-LABEL: constant_shift_v64i8:
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
 ; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm3
-; AVX512DQ-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
 ; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
@@ -324,7 +324,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: constant_shift_v64i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
 ; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & m32bcst)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
index 3590c4d027be7..ac5830604461c 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
@@ -100,16 +100,14 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) {
 ; SSE-NEXT: pshufb %xmm3, %xmm4
 ; SSE-NEXT: pshufb %xmm8, %xmm1
 ; SSE-NEXT: por %xmm4, %xmm1
-; SSE-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: pmaddubsw %xmm3, %xmm4
-; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: pmaddubsw %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pmullw %xmm1, %xmm2
+; SSE-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm3, %xmm2
+; SSE-NEXT: pandn %xmm1, %xmm3
+; SSE-NEXT: pmaddubsw %xmm3, %xmm0
 ; SSE-NEXT: psllw $8, %xmm0
-; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: PR50049:
@@ -129,21 +127,20 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) {
 ; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
+; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm1
-; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm2
-; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpandn %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: PR50049: