[X86] Avoid extra PMADDUBSW(X,AND(Y)) in <X x i8> multiplication #168262
Conversation
On SSSE3 targets we use PMADDUBSW of the odd/even elements with suitable masking to avoid having to extend to <X x i16> types and the additional Port0/5 pressure that brings. However, the lower i8 element of each pair can safely use PMULLW directly without any pre-masking, as we only use the lower i8 bits of the result, which are only affected by the lower i8 of the inputs.
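A minimal scalar sketch of the identity this relies on, checked for one i16 lane holding two i8 elements (the loop bounds and variable names here are illustrative, not taken from the patch): the low byte of a 16-bit product depends only on the low bytes of its operands, so the lower element can use PMULLW without pre-masking B.

```cpp
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t A = 0; A < 0x10000; ++A) {
    for (uint32_t B = 0; B < 0x10000; B += 257) { // sparse sweep of B keeps this quick
      // Old lowering for the lower element: PMADDUBSW(A, B & 0x00FF). With the
      // odd byte of B zeroed, the lane reduces to u8(A.lo) * s8(B.lo); a single
      // product cannot saturate, so plain integer arithmetic models it exactly.
      int32_t madd = int32_t(A & 0xFF) * int8_t(B & 0xFF);
      uint16_t oldLo = uint16_t(madd) & 0x00FF;

      // New lowering: PMULLW(A, B) with the 0x00FF mask applied afterwards.
      // The high bytes of A and B only feed bits 8..15 of the product.
      uint16_t newLo = uint16_t(A * B) & 0x00FF;

      assert(oldLo == newLo);
    }
  }
  return 0;
}
```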
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

On SSSE3 targets we use PMADDUBSW of the odd/even elements with suitable masking to avoid having to extend to <X x i16> types and the additional Port0/5 pressure that brings. However, the lower i8 element of each pair can safely use PMULLW directly without any pre-masking, as we only use the lower i8 bits of the result, which are only affected by the lower i8 of the inputs.

Patch is 138.97 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168262.diff

24 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 66f5802a67465..593c7627a6575 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29629,9 +29629,9 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
}
if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
- SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
- SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
+ SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, DAG.getBitcast(ExVT, A),
+ DAG.getBitcast(ExVT, B));
SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll
index 1133cdfd083be..d21df472f06cb 100644
--- a/llvm/test/CodeGen/X86/avx2-arith.ll
+++ b/llvm/test/CodeGen/X86/avx2-arith.ll
@@ -121,14 +121,13 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; CHECK-LABEL: mul_v32i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-NEXT: vpand %ymm2, %ymm1, %ymm3
-; CHECK-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
-; CHECK-NEXT: vpand %ymm2, %ymm3, %ymm3
-; CHECK-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm2
+; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-NEXT: vpand %ymm3, %ymm2, %ymm2
+; CHECK-NEXT: vpandn %ymm1, %ymm3, %ymm1
; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vpsllw $8, %ymm0, %ymm0
-; CHECK-NEXT: vpor %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: vpor %ymm0, %ymm2, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%x = mul <32 x i8> %i, %j
ret <32 x i8> %x
diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
index 29c41cac222b2..15d187a5baeec 100644
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -504,7 +504,7 @@ define <16 x i8> @PR35579(<16 x i8> %x) {
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSE-NEXT: psllw $8, %xmm1
-; SSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,2,0,4,0,2,0,8,0,2,0,4,0,2,0]
+; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,2,1,4,1,2,1,8,1,2,1,4,1,2,1]
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll
index 30f1874c51fed..638d88481f071 100644
--- a/llvm/test/CodeGen/X86/gfni-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-shifts.ll
@@ -388,7 +388,7 @@ define <16 x i8> @constant_shl_v16i8(<16 x i8> %a) nounwind {
; GFNISSE-NEXT: movdqa %xmm0, %xmm1
; GFNISSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; GFNISSE-NEXT: psllw $8, %xmm1
-; GFNISSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT: por %xmm1, %xmm0
; GFNISSE-NEXT: retq
@@ -397,7 +397,7 @@ define <16 x i8> @constant_shl_v16i8(<16 x i8> %a) nounwind {
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; GFNIAVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNIAVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; GFNIAVX1-NEXT: retq
@@ -1213,21 +1213,20 @@ define <32 x i8> @splatvar_ashr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind {
; GFNISSE-LABEL: constant_shl_v32i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2]
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNISSE-NEXT: movdqa %xmm0, %xmm3
-; GFNISSE-NEXT: pmaddubsw %xmm2, %xmm3
+; GFNISSE-NEXT: pmullw %xmm2, %xmm3
; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; GFNISSE-NEXT: pand %xmm4, %xmm3
; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; GFNISSE-NEXT: pmaddubsw %xmm5, %xmm0
; GFNISSE-NEXT: psllw $8, %xmm0
; GFNISSE-NEXT: por %xmm3, %xmm0
-; GFNISSE-NEXT: movdqa %xmm1, %xmm3
-; GFNISSE-NEXT: pmaddubsw %xmm2, %xmm3
-; GFNISSE-NEXT: pand %xmm4, %xmm3
+; GFNISSE-NEXT: pmullw %xmm1, %xmm2
+; GFNISSE-NEXT: pand %xmm4, %xmm2
; GFNISSE-NEXT: pmaddubsw %xmm5, %xmm1
; GFNISSE-NEXT: psllw $8, %xmm1
-; GFNISSE-NEXT: por %xmm3, %xmm1
+; GFNISSE-NEXT: por %xmm2, %xmm1
; GFNISSE-NEXT: retq
;
; GFNIAVX1-LABEL: constant_shl_v32i8:
@@ -1239,9 +1238,9 @@ define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind {
; GFNIAVX1-NEXT: vpmaddubsw %xmm1, %xmm3, %xmm1
; GFNIAVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2]
-; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm3, %xmm3
-; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
+; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; GFNIAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; GFNIAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
@@ -1251,14 +1250,14 @@ define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind {
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; GFNIAVX2-NEXT: vpsllw $8, %ymm1, %ymm1
-; GFNIAVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512VL-LABEL: constant_shl_v32i8:
; GFNIAVX512VL: # %bb.0:
-; GFNIAVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNIAVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; GFNIAVX512VL-NEXT: vpsllw $8, %ymm0, %ymm0
; GFNIAVX512VL-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 | (ymm1 & m32bcst)
@@ -2521,9 +2520,9 @@ define <64 x i8> @splatvar_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
; GFNISSE-LABEL: constant_shl_v64i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [1,4,16,64,128,32,8,2]
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNISSE-NEXT: movdqa %xmm0, %xmm6
-; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6
+; GFNISSE-NEXT: pmullw %xmm4, %xmm6
; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; GFNISSE-NEXT: pand %xmm5, %xmm6
; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
@@ -2531,23 +2530,22 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
; GFNISSE-NEXT: psllw $8, %xmm0
; GFNISSE-NEXT: por %xmm6, %xmm0
; GFNISSE-NEXT: movdqa %xmm1, %xmm6
-; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6
+; GFNISSE-NEXT: pmullw %xmm4, %xmm6
; GFNISSE-NEXT: pand %xmm5, %xmm6
; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm1
; GFNISSE-NEXT: psllw $8, %xmm1
; GFNISSE-NEXT: por %xmm6, %xmm1
; GFNISSE-NEXT: movdqa %xmm2, %xmm6
-; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6
+; GFNISSE-NEXT: pmullw %xmm4, %xmm6
; GFNISSE-NEXT: pand %xmm5, %xmm6
; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm2
; GFNISSE-NEXT: psllw $8, %xmm2
; GFNISSE-NEXT: por %xmm6, %xmm2
-; GFNISSE-NEXT: movdqa %xmm3, %xmm6
-; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6
-; GFNISSE-NEXT: pand %xmm5, %xmm6
+; GFNISSE-NEXT: pmullw %xmm3, %xmm4
+; GFNISSE-NEXT: pand %xmm5, %xmm4
; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm3
; GFNISSE-NEXT: psllw $8, %xmm3
-; GFNISSE-NEXT: por %xmm6, %xmm3
+; GFNISSE-NEXT: por %xmm4, %xmm3
; GFNISSE-NEXT: retq
;
; GFNIAVX1-LABEL: constant_shl_v64i8:
@@ -2559,9 +2557,9 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm4, %xmm5
; GFNIAVX1-NEXT: vpsllw $8, %xmm5, %xmm5
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
-; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,4,16,64,128,32,8,2]
-; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm4, %xmm4
-; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; GFNIAVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
@@ -2572,8 +2570,8 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm6, %xmm2
; GFNIAVX1-NEXT: vpsllw $8, %xmm2, %xmm2
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm6, %xmm3
-; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm3
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; GFNIAVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; GFNIAVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
@@ -2581,9 +2579,9 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
;
; GFNIAVX2-LABEL: constant_shl_v64i8:
; GFNIAVX2: # %bb.0:
-; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1]
-; GFNIAVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm3
+; GFNIAVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm3
; GFNIAVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; GFNIAVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
@@ -2591,7 +2589,7 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX2-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm0
; GFNIAVX2-NEXT: vpsllw $8, %ymm0, %ymm0
; GFNIAVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
-; GFNIAVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm2
+; GFNIAVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm2
; GFNIAVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; GFNIAVX2-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm1
; GFNIAVX2-NEXT: vpsllw $8, %ymm1, %ymm1
@@ -2601,10 +2599,10 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX512VL-LABEL: constant_shl_v64i8:
; GFNIAVX512VL: # %bb.0:
; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNIAVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
-; GFNIAVX512VL-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm3
-; GFNIAVX512VL-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm2
+; GFNIAVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; GFNIAVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm2
; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; GFNIAVX512VL-NEXT: # ymm3 = mem[0,1,0,1]
@@ -2618,7 +2616,7 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
;
; GFNIAVX512BW-LABEL: constant_shl_v64i8:
; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNIAVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNIAVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
; GFNIAVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & m32bcst)
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
index a798f4c38f68f..541ca9d4f4096 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
@@ -2368,17 +2368,15 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; SSE41-NEXT: psubb %xmm3, %xmm1
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pand %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
-; SSE41-NEXT: pand %xmm3, %xmm5
-; SSE41-NEXT: pandn %xmm2, %xmm3
-; SSE41-NEXT: pmaddubsw %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: pmullw %xmm2, %xmm3
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pandn %xmm2, %xmm4
+; SSE41-NEXT: pmaddubsw %xmm4, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: por %xmm1, %xmm5
-; SSE41-NEXT: paddb %xmm5, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm3
+; SSE41-NEXT: paddb %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: vec128_i8_signed_reg_reg:
@@ -2390,14 +2388,13 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpandn %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
@@ -2429,12 +2426,10 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
+; XOP-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOP-FALLBACK-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3
+; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: retq
;
@@ -2447,12 +2442,10 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
+; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOPAVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3
+; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14]
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: retq
;
@@ -2591,17 +2584,15 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; SSE41-NEXT: psubb %xmm2, %xmm1
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pand %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm3, %xmm5
-; SSE41-NEXT: pand %xmm2, %xmm5
-; SSE41-NEXT: pandn %xmm4, %xmm2
-; SSE41-NEXT: pmaddubsw %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pmullw %xmm4, %xmm2
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm3, %xmm2
+; SSE41-NEXT: pandn %xmm4, %xmm3
+; SSE41-NEXT: pmaddubsw %xmm3, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: por %xmm1, %xmm5
-; SSE41-NEXT: paddb %xmm5, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm2
+; SSE41-NEXT: paddb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: vec128_i8_unsigned_reg_reg:
@@ -2615,14 +2606,13 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
...
[truncated]
phoebewang left a comment:
LGTM.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/186/builds/14015. Here is the relevant piece of the build log for reference:
On SSSE3 targets we use PMADDUBSW of the odd/even elements with suitable masking to avoid having to extend/truncate with <X x i16> types and to avoid additional Port0/5 pressure. However, the lower i8 element of each pair can safely use PMULLW directly without any pre-masking, as we only use the lower i8 bits of the result, which are only affected by the lower i8 of the inputs.
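As a sanity check of the full per-lane recipe (PMULLW plus an AND with 0x00FF for the lower element, PMADDUBSW against B & 0xFF00 shifted left by 8 for the upper element, then OR), here is a hedged scalar model; lowerLane and the sweep parameters are illustrative assumptions, not the generated vector code.

```cpp
#include <cassert>
#include <cstdint>

// One i16 lane holding two i8 elements: byte 0 is the lower element, byte 1 the upper one.
static uint16_t lowerLane(uint16_t a, uint16_t b) {
  // PMULLW(A, B), then AND with the 0x00FF mask: lower element's product.
  uint16_t rlo = uint16_t(uint32_t(a) * b) & 0x00FF;
  // PMADDUBSW(A, ANDNP(Mask, B)): only the odd byte pair survives, and a
  // single u8*s8 product cannot saturate.
  int32_t madd = int32_t(a >> 8) * int8_t(b >> 8);
  // VSHLI by 8 puts the upper element's product into byte 1.
  uint16_t rhi = uint16_t(uint16_t(madd) << 8);
  return uint16_t(rlo | rhi);
}

int main() {
  for (uint32_t A = 0; A < 0x10000; A += 7) {   // sparse sweeps keep the check quick
    for (uint32_t B = 0; B < 0x10000; B += 13) {
      uint16_t a = uint16_t(A), b = uint16_t(B);
      // Reference result: two independent element-wise i8 multiplies.
      uint16_t expLo = uint8_t((a & 0xFF) * (b & 0xFF));
      uint16_t expHi = uint8_t((a >> 8) * (b >> 8));
      assert(lowerLane(a, b) == uint16_t(expLo | (expHi << 8)));
    }
  }
  return 0;
}
```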