6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -674,7 +674,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE4-LABEL: truncstore_v8i64_v8i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm6, %xmm6
; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE4-NEXT: pmovsxwq {{.*#+}} xmm7 = [255,255]
; SSE4-NEXT: pand %xmm7, %xmm3
; SSE4-NEXT: pand %xmm7, %xmm2
; SSE4-NEXT: packusdw %xmm3, %xmm2
@@ -2727,7 +2727,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; SSE4-LABEL: truncstore_v16i32_v16i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm8, %xmm8
; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE4-NEXT: pmovsxwd {{.*#+}} xmm9 = [255,255,255,255]
; SSE4-NEXT: pand %xmm9, %xmm3
; SSE4-NEXT: pand %xmm9, %xmm2
; SSE4-NEXT: packusdw %xmm3, %xmm2
@@ -3720,7 +3720,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; SSE4-LABEL: truncstore_v8i32_v8i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm4, %xmm4
; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE4-NEXT: pmovsxwd {{.*#+}} xmm5 = [255,255,255,255]
; SSE4-NEXT: pand %xmm5, %xmm1
; SSE4-NEXT: pand %xmm5, %xmm0
; SSE4-NEXT: packusdw %xmm1, %xmm0
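Note on the pattern repeated throughout this diff: a full-width constant-pool load (movdqa, or a vpbroadcastq/vmovddup/vbroadcasti128 splat load) becomes a sign-extending pmovsx load, so the pool only has to hold each element at the narrowest width that sign-extends back to the original value. A minimal before/after sketch, with abbreviated assembler directives and an illustrative pool label (neither is copied from the generated tests):

  # before: [255,255] as <2 x i64> needs a 16-byte pool entry
  .LCPI0_0: .quad 255, 255
          movdqa .LCPI0_0(%rip), %xmm7
  # after: 255 sign-extends correctly from i16 (from i8 it would read back as -1),
  # so a 4-byte entry of two words suffices
  .LCPI0_0: .word 255, 255
          pmovsxwq .LCPI0_0(%rip), %xmm7   # xmm7 = [255,255] as <2 x i64>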
231 changes: 81 additions & 150 deletions llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -2603,7 +2603,7 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; SSE4-LABEL: truncstore_v16i32_v16i16:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm9, %xmm9
; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535]
; SSE4-NEXT: pmovsxbw {{.*#+}} xmm8 = [65535,0,65535,0,65535,0,65535,0]
; SSE4-NEXT: pminud %xmm8, %xmm1
; SSE4-NEXT: pminud %xmm8, %xmm0
; SSE4-NEXT: packusdw %xmm1, %xmm0
@@ -3293,7 +3293,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; SSE4-LABEL: truncstore_v16i32_v16i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm8, %xmm8
; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255]
; SSE4-NEXT: pmovsxwd {{.*#+}} xmm9 = [255,255,255,255]
; SSE4-NEXT: pminud %xmm9, %xmm1
; SSE4-NEXT: pminud %xmm9, %xmm0
; SSE4-NEXT: packusdw %xmm1, %xmm0
@@ -3937,7 +3937,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; SSE4-LABEL: truncstore_v8i32_v8i16:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm4, %xmm4
; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
; SSE4-NEXT: pmovsxbw {{.*#+}} xmm5 = [65535,0,65535,0,65535,0,65535,0]
; SSE4-NEXT: pminud %xmm5, %xmm1
; SSE4-NEXT: pminud %xmm5, %xmm0
; SSE4-NEXT: packusdw %xmm1, %xmm0
@@ -4327,7 +4327,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; SSE4-LABEL: truncstore_v8i32_v8i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm4, %xmm4
; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255]
; SSE4-NEXT: pmovsxwd {{.*#+}} xmm5 = [255,255,255,255]
; SSE4-NEXT: pminud %xmm5, %xmm1
; SSE4-NEXT: pminud %xmm5, %xmm0
; SSE4-NEXT: packusdw %xmm1, %xmm0
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
@@ -960,7 +960,7 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
@@ -996,7 +996,7 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
; AVX512BW-FALLBACK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
@@ -1170,7 +1170,7 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1
@@ -1206,7 +1206,7 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
; AVX512BW-FALLBACK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminuq %zmm1, %zmm0, %zmm2
; AVX512BW-FALLBACK-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1
@@ -1360,7 +1360,7 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT: vpminsq %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
@@ -1397,7 +1397,7 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
; AVX512BW-FALLBACK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsq %zmm0, %zmm1, %zmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
@@ -1550,7 +1550,7 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
@@ -1587,7 +1587,7 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
; AVX512BW-FALLBACK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
@@ -1743,7 +1743,7 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX512F-NEXT: vmovdqa (%rsi), %xmm1
; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
@@ -1781,7 +1781,7 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
; AVX512BW-FALLBACK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
30 changes: 10 additions & 20 deletions llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -399,8 +399,7 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1]
; AVX1-NEXT: # xmm8 = mem[0,0]
; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm9
@@ -458,8 +457,7 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1]
; XOP-NEXT: # xmm8 = mem[0,0]
; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
; XOP-NEXT: vpor %xmm5, %xmm8, %xmm5
; XOP-NEXT: vpmuludq %xmm5, %xmm1, %xmm1
; XOP-NEXT: vpsrlq $32, %xmm5, %xmm9
@@ -571,8 +569,7 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7
; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1]
; AVX1-NEXT: # xmm8 = mem[0,0]
; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm4
; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1
@@ -634,8 +631,7 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1]
; XOP-NEXT: # xmm8 = mem[0,0]
; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
; XOP-NEXT: vpor %xmm5, %xmm8, %xmm5
; XOP-NEXT: vpmuludq %xmm5, %xmm1, %xmm1
; XOP-NEXT: vpsrlq $32, %xmm5, %xmm9
@@ -745,8 +741,7 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0
; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1]
; AVX1-NEXT: # xmm8 = mem[0,0]
; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm9
@@ -806,8 +801,7 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0
; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1]
; XOP-NEXT: # xmm8 = mem[0,0]
; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
; XOP-NEXT: vpor %xmm5, %xmm8, %xmm5
; XOP-NEXT: vpmuludq %xmm5, %xmm0, %xmm0
; XOP-NEXT: vpsrlq $32, %xmm5, %xmm9
@@ -917,8 +911,7 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1]
; AVX1-NEXT: # xmm8 = mem[0,0]
; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm9
@@ -978,8 +971,7 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1]
; XOP-NEXT: # xmm8 = mem[0,0]
; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
; XOP-NEXT: vpor %xmm5, %xmm8, %xmm5
; XOP-NEXT: vpmuludq %xmm5, %xmm1, %xmm1
; XOP-NEXT: vpsrlq $32, %xmm5, %xmm9
@@ -1090,8 +1082,7 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0
; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1]
; AVX1-NEXT: # xmm8 = mem[0,0]
; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm9
@@ -1153,8 +1144,7 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0
; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1]
; XOP-NEXT: # xmm8 = mem[0,0]
; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
; XOP-NEXT: vpor %xmm5, %xmm8, %xmm5
; XOP-NEXT: vpmuludq %xmm5, %xmm0, %xmm0
; XOP-NEXT: vpsrlq $32, %xmm5, %xmm9
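Note: the AVX1/XOP checks in this file shrink from two lines to one, since the vmovddup pattern carried a second "# xmm8 = mem[0,0]" shuffle-comment line while vpmovsxbq is a single instruction; that is where the 10 additions against 20 deletions come from.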
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1144,7 +1144,7 @@ define <16 x i16> @trunc_v16i32_v16i16_zeroes(<16 x i32>* %x) nounwind "min-lega
; CHECK-LABEL: trunc_v16i32_v16i16_zeroes:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT: retq
%a = load <16 x i32>, <16 x i32>* %x
@@ -1199,7 +1199,7 @@ define <16 x i16> @trunc_v16i32_v16i16_sign(<16 x i32>* %x) nounwind "min-legal-
; CHECK-LABEL: trunc_v16i32_v16i16_sign:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT: retq
%a = load <16 x i32>, <16 x i32>* %x
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -1984,7 +1984,7 @@ define i1 @allones_v2i64_and1(<2 x i64> %arg) {
; KNL-LABEL: allones_v2i64_and1:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1]
; KNL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,1]
; KNL-NEXT: vptestnmq %zmm1, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $3, %al
@@ -3185,7 +3185,7 @@ define i1 @allones_v2i64_and4(<2 x i64> %arg) {
; KNL-LABEL: allones_v2i64_and4:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4]
; KNL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,4]
; KNL-NEXT: vptestnmq %zmm1, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $3, %al
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1308,9 +1308,9 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2]
; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2]
; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm4, %ymm4
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
; AVX2-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
@@ -1327,12 +1327,12 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; AVX2-FAST-ALL-NEXT: vmovdqu (%rdx), %xmm1
; AVX2-FAST-ALL-NEXT: vmovdqu (%rcx), %xmm2
; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2]
; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2]
; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm4, %ymm4
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6]
; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6]
; AVX2-FAST-ALL-NEXT: vpermd %ymm3, %ymm5, %ymm3
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5,u,u,2,3,6,7,u,u,8,9,12,13,u,u,18,19,22,23,u,u,24,25,28,29,u,u,26,27]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
; AVX2-FAST-ALL-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
; AVX2-FAST-ALL-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
@@ -1353,9 +1353,9 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2]
; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2]
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
; AVX2-FAST-PERLANE-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/packus.ll
@@ -131,7 +131,7 @@ define <8 x i16> @trunc_lshr_v4i64_demandedelts(<4 x i64> %a0) {
; SSE4-LABEL: trunc_lshr_v4i64_demandedelts:
; SSE4: # %bb.0:
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
; SSE4-NEXT: pmovsxbd {{.*#+}} xmm2 = [1,1,1,1]
; SSE4-NEXT: pand %xmm2, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE4-NEXT: pand %xmm2, %xmm0
120 changes: 92 additions & 28 deletions llvm/test/CodeGen/X86/paddus.ll
@@ -993,12 +993,26 @@ define <16 x i16> @test27(<16 x i16> %x) {
}

define <16 x i16> @test28(<16 x i16> %x) {
; SSE-LABEL: test28:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65534,65534,65534,65534,65534,65534,65534,65534]
; SSE-NEXT: paddusw %xmm2, %xmm0
; SSE-NEXT: paddusw %xmm2, %xmm1
; SSE-NEXT: retq
; SSE2-LABEL: test28:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65534,65534,65534,65534,65534,65534,65534,65534]
; SSE2-NEXT: paddusw %xmm2, %xmm0
; SSE2-NEXT: paddusw %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test28:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65534,65534,65534,65534,65534,65534,65534,65534]
; SSSE3-NEXT: paddusw %xmm2, %xmm0
; SSSE3-NEXT: paddusw %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: test28:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [65534,65534,65534,65534,65534,65534,65534,65534]
; SSE41-NEXT: paddusw %xmm2, %xmm0
; SSE41-NEXT: paddusw %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: test28:
; AVX1: # %bb.0:
@@ -1115,12 +1129,26 @@ define <16 x i16> @test29(<16 x i16> %x) {
}

define <16 x i16> @test30(<16 x i16> %x) {
; SSE-LABEL: test30:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
; SSE-NEXT: paddusw %xmm2, %xmm0
; SSE-NEXT: paddusw %xmm2, %xmm1
; SSE-NEXT: retq
; SSE2-LABEL: test30:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
; SSE2-NEXT: paddusw %xmm2, %xmm0
; SSE2-NEXT: paddusw %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test30:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
; SSSE3-NEXT: paddusw %xmm2, %xmm0
; SSSE3-NEXT: paddusw %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: test30:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
; SSE41-NEXT: paddusw %xmm2, %xmm0
; SSE41-NEXT: paddusw %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: test30:
; AVX1: # %bb.0:
@@ -1294,14 +1322,32 @@ define <32 x i16> @test33(<32 x i16> %x) {
}

define <32 x i16> @test34(<32 x i16> %x) {
; SSE-LABEL: test34:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534]
; SSE-NEXT: paddusw %xmm4, %xmm0
; SSE-NEXT: paddusw %xmm4, %xmm1
; SSE-NEXT: paddusw %xmm4, %xmm2
; SSE-NEXT: paddusw %xmm4, %xmm3
; SSE-NEXT: retq
; SSE2-LABEL: test34:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534]
; SSE2-NEXT: paddusw %xmm4, %xmm0
; SSE2-NEXT: paddusw %xmm4, %xmm1
; SSE2-NEXT: paddusw %xmm4, %xmm2
; SSE2-NEXT: paddusw %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test34:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534]
; SSSE3-NEXT: paddusw %xmm4, %xmm0
; SSSE3-NEXT: paddusw %xmm4, %xmm1
; SSSE3-NEXT: paddusw %xmm4, %xmm2
; SSSE3-NEXT: paddusw %xmm4, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: test34:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534]
; SSE41-NEXT: paddusw %xmm4, %xmm0
; SSE41-NEXT: paddusw %xmm4, %xmm1
; SSE41-NEXT: paddusw %xmm4, %xmm2
; SSE41-NEXT: paddusw %xmm4, %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: test34:
; AVX1: # %bb.0:
@@ -1478,14 +1524,32 @@ define <32 x i16> @test35(<32 x i16> %x) {
}

define <32 x i16> @test36(<32 x i16> %x) {
; SSE-LABEL: test36:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,2,2,2,2,2,2,2]
; SSE-NEXT: paddusw %xmm4, %xmm0
; SSE-NEXT: paddusw %xmm4, %xmm1
; SSE-NEXT: paddusw %xmm4, %xmm2
; SSE-NEXT: paddusw %xmm4, %xmm3
; SSE-NEXT: retq
; SSE2-LABEL: test36:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2,2,2,2,2,2,2,2]
; SSE2-NEXT: paddusw %xmm4, %xmm0
; SSE2-NEXT: paddusw %xmm4, %xmm1
; SSE2-NEXT: paddusw %xmm4, %xmm2
; SSE2-NEXT: paddusw %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test36:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2,2,2,2,2,2,2,2]
; SSSE3-NEXT: paddusw %xmm4, %xmm0
; SSSE3-NEXT: paddusw %xmm4, %xmm1
; SSSE3-NEXT: paddusw %xmm4, %xmm2
; SSSE3-NEXT: paddusw %xmm4, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: test36:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [2,2,2,2,2,2,2,2]
; SSE41-NEXT: paddusw %xmm4, %xmm0
; SSE41-NEXT: paddusw %xmm4, %xmm1
; SSE41-NEXT: paddusw %xmm4, %xmm2
; SSE41-NEXT: paddusw %xmm4, %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: test36:
; AVX1: # %bb.0:
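Note: in this file (and in pmul.ll below) the previously shared SSE check blocks split into SSE2/SSSE3/SSE41 blocks, because pmovsx* is an SSE4.1 instruction and pre-SSE4.1 subtargets keep the movdqa constant load. A sketch of the usual RUN-line layout that yields these prefixes (flag spellings are assumed, not copied from the file):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41

Once the subtargets diverge, update_llc_test_checks.py can no longer emit a common SSE block and writes one block per prefix, which accounts for the large insertion counts in test28/test30/test34/test36.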
109 changes: 72 additions & 37 deletions llvm/test/CodeGen/X86/pmul.ll
@@ -24,7 +24,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind {
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm3, %xmm0
@@ -109,20 +109,31 @@ entry:
}

define <2 x i64> @mul_v2i64c(<2 x i64> %i) nounwind {
; SSE-LABEL: mul_v2i64c:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [117,117]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: retq
; SSE2-LABEL: mul_v2i64c:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [117,117]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: psrlq $32, %xmm0
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: psllq $32, %xmm0
; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v2i64c:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [117,117]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pmuludq %xmm1, %xmm2
; SSE41-NEXT: psrlq $32, %xmm0
; SSE41-NEXT: pmuludq %xmm1, %xmm0
; SSE41-NEXT: psllq $32, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v2i64c:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [117,117]
; AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [117,117]
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
@@ -396,7 +407,7 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117]
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117]
; SSE41-NEXT: pmullw %xmm4, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm5, %xmm0
@@ -452,12 +463,19 @@ entry:
}

define <16 x i16> @mul_v16i16c(<16 x i16> %i) nounwind {
; SSE-LABEL: mul_v16i16c:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm2, %xmm1
; SSE-NEXT: retq
; SSE2-LABEL: mul_v16i16c:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: pmullw %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v16i16c:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: pmullw %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v16i16c:
; AVX: # %bb.0: # %entry
@@ -488,7 +506,7 @@ define <8 x i32> @mul_v8i32c(<8 x i32> %i) nounwind {
;
; SSE41-LABEL: mul_v8i32c:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117]
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [117,117,117,117]
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: pmulld %xmm2, %xmm1
; SSE41-NEXT: retq
@@ -504,22 +522,39 @@ entry:
}

define <4 x i64> @mul_v4i64c(<4 x i64> %i) nounwind {
; SSE-LABEL: mul_v4i64c:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [117,117]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm2, %xmm1
; SSE-NEXT: psllq $32, %xmm1
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: retq
; SSE2-LABEL: mul_v4i64c:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: psrlq $32, %xmm0
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: psllq $32, %xmm0
; SSE2-NEXT: paddq %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: psrlq $32, %xmm1
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: psllq $32, %xmm1
; SSE2-NEXT: paddq %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i64c:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [117,117]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pmuludq %xmm2, %xmm3
; SSE41-NEXT: psrlq $32, %xmm0
; SSE41-NEXT: pmuludq %xmm2, %xmm0
; SSE41-NEXT: psllq $32, %xmm0
; SSE41-NEXT: paddq %xmm3, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pmuludq %xmm2, %xmm3
; SSE41-NEXT: psrlq $32, %xmm1
; SSE41-NEXT: pmuludq %xmm2, %xmm1
; SSE41-NEXT: psllq $32, %xmm1
; SSE41-NEXT: paddq %xmm3, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i64c:
; AVX: # %bb.0: # %entry
@@ -764,7 +799,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [117,117,117,117,117,117,117,117]
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm6 = [117,117,117,117,117,117,117,117]
; SSE41-NEXT: pmullw %xmm6, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm7, %xmm1
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/pmulh.ll
@@ -329,7 +329,7 @@ define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
;
; SSE41-LABEL: and_mulhuw_v16i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767]
; SSE41-NEXT: pmovsxwd {{.*#+}} xmm8 = [32767,32767,32767,32767]
; SSE41-NEXT: pand %xmm8, %xmm3
; SSE41-NEXT: pand %xmm8, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/pr48215.ll
@@ -12,11 +12,11 @@ define i32 @PR48215(i32 %a0, i32 %a1) {
; AVX1-NEXT: idivl %esi
; AVX1-NEXT: vmovd %edx, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7]
; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7]
; AVX1-NEXT: vmovd %eax, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3]
; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,1,2,3]
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vmovmskps %ymm2, %ecx
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/pr57340.ll
@@ -7,7 +7,7 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK-NEXT: vpbroadcastw (%rax), %xmm0
; CHECK-NEXT: vmovdqu (%rax), %ymm2
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm1
; CHECK-NEXT: vpextrw $0, %xmm0, %eax
; CHECK-NEXT: movzwl %ax, %eax
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/pr61964.ll
@@ -30,7 +30,7 @@ define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16
; AVX2-LABEL: splitTransposeDecode_8_avx2:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5,2,6,3,7]
; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,1,5,2,6,3,7]
; AVX2-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; AVX2-NEXT: vpermd %ymm0, %ymm3, %ymm1
@@ -39,9 +39,9 @@ define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16
;
; AVX512VL-LABEL: splitTransposeDecode_8_avx2:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,8,24,1,17,9,25,2,18,10,26,3,19,11,27]
; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,16,8,24,1,17,9,25,2,18,10,26,3,19,11,27]
; AVX512VL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,20,12,28,5,21,13,29,6,22,14,30,7,23,15,31]
; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm3 = [4,20,12,28,5,21,13,29,6,22,14,30,7,23,15,31]
; AVX512VL-NEXT: vpermi2w %ymm1, %ymm0, %ymm3
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: vmovdqa %ymm3, %ymm1
@@ -67,7 +67,7 @@ define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16
; XOPAVX2-LABEL: splitTransposeDecode_8_avx2:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5,2,6,3,7]
; XOPAVX2-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,1,5,2,6,3,7]
; XOPAVX2-NEXT: vpermd %ymm2, %ymm3, %ymm2
; XOPAVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; XOPAVX2-NEXT: vpermd %ymm0, %ymm3, %ymm1
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/X86/pr62014.ll
@@ -26,7 +26,7 @@ define <2 x i64> @select_cast_cond_multiuse_v2i64(<2 x i64> %x, <2 x i64> %y, i2
; SSE42-NEXT: movapd %xmm0, %xmm2
; SSE42-NEXT: movd %edi, %xmm0
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,2]
; SSE42-NEXT: pmovsxbq {{.*#+}} xmm3 = [1,2]
; SSE42-NEXT: pand %xmm3, %xmm0
; SSE42-NEXT: pcmpeqq %xmm3, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
@@ -38,7 +38,7 @@
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm2
; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2]
; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,2]
; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
@@ -91,7 +91,7 @@ define <4 x i32> @select_cast_cond_multiuse_v4i32(<4 x i32> %x, <4 x i32> %y, i4
; SSE42-NEXT: movaps %xmm0, %xmm2
; SSE42-NEXT: movd %edi, %xmm0
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8]
; SSE42-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8]
; SSE42-NEXT: pand %xmm3, %xmm0
; SSE42-NEXT: pcmpeqd %xmm3, %xmm0
; SSE42-NEXT: blendvps %xmm0, %xmm2, %xmm1
@@ -103,7 +103,7 @@
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm2
; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,4,8]
; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,2,4,8]
; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
@@ -306,12 +306,12 @@ define <8 x float> @select_cast_cond_multiuse_v8i16_v8f32(<8 x float> %x, <8 x f
; SSE42-NEXT: pand %xmm5, %xmm6
; SSE42-NEXT: pcmpeqw %xmm5, %xmm6
; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0]
; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [1,2,4,8]
; SSE42-NEXT: pmovsxbd {{.*#+}} xmm7 = [1,2,4,8]
; SSE42-NEXT: movdqa %xmm5, %xmm0
; SSE42-NEXT: pand %xmm7, %xmm0
; SSE42-NEXT: pcmpeqd %xmm7, %xmm0
; SSE42-NEXT: blendvps %xmm0, %xmm4, %xmm2
; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [16,32,64,128]
; SSE42-NEXT: pmovsxwd {{.*#+}} xmm0 = [16,32,64,128]
; SSE42-NEXT: pand %xmm0, %xmm5
; SSE42-NEXT: pcmpeqd %xmm0, %xmm5
; SSE42-NEXT: movdqa %xmm5, %xmm0
@@ -328,7 +328,7 @@
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX2-NEXT: vpcmpeqw %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/pr63507.ll
@@ -4,7 +4,7 @@
define <4 x i32> @PR63507() {
; CHECK-LABEL: PR63507:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4294967295,4294967295]
; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,0,4294967295,0]
; CHECK-NEXT: vpmulld %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
%psll.i = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer)
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/pr74736.ll
@@ -39,7 +39,7 @@ define void @main(<16 x i32> %0, i32 %1) {
; AVX-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; AVX-NEXT: vpaddd %ymm1, %ymm1, %ymm1
; AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,1,3,3,5,5,7]
; AVX-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,1,3,3,5,5,7]
; AVX-NEXT: vpermd %ymm0, %ymm2, %ymm2
; AVX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/pr77459.ll
@@ -98,7 +98,7 @@ define i8 @reverse_cmp_v8i1(<8 x i16> %a0, <8 x i16> %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
@@ -157,7 +157,7 @@ define i16 @reverse_cmp_v16i1(<16 x i8> %a0, <16 x i8> %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
; AVX512-NEXT: vpmovm2w %k0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512-NEXT: vpmovsxbw {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovw2m %ymm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
@@ -49,7 +49,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) {
; AVX512VL-NEXT: vptestnmd %ymm1, %ymm1, %k2
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
@@ -65,7 +65,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) {
; AVX256VLBW-NEXT: vptestnmd %ymm1, %ymm1, %k1
; AVX256VLBW-NEXT: vpmovm2w %k1, %ymm0
; AVX256VLBW-NEXT: vpmovm2w %k0, %ymm1
; AVX256VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
; AVX256VLBW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
; AVX256VLBW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX256VLBW-NEXT: vpmovw2m %ymm2, %k0
; AVX256VLBW-NEXT: vpmovm2b %k0, %xmm0
@@ -80,7 +80,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) {
; AVX512VLBW-NEXT: vptestnmd %ymm1, %ymm1, %k2
; AVX512VLBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VLBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
; AVX512VLBW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
; AVX512VLBW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
; AVX512VLBW-NEXT: vptestmd %zmm2, %zmm2, %k0
; AVX512VLBW-NEXT: vpmovm2b %k0, %xmm0
@@ -95,7 +95,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) {
; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
@@ -111,7 +111,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) {
; AVX512BW-NEXT: vptestnmd %zmm1, %zmm1, %k2
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
@@ -181,7 +181,7 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
; AVX512NOBW-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512NOBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512NOBW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512NOBW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
; AVX512NOBW-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
18 changes: 9 additions & 9 deletions llvm/test/CodeGen/X86/psubus.ll
@@ -793,7 +793,7 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
;
; SSE41-LABEL: test13:
; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
; SSE41-NEXT: pminud %xmm3, %xmm2
; SSE41-NEXT: pminud %xmm3, %xmm1
; SSE41-NEXT: packusdw %xmm2, %xmm1
@@ -903,7 +903,7 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; SSE41-NEXT: pcmpeqd %xmm2, %xmm5
; SSE41-NEXT: packssdw %xmm5, %xmm6
; SSE41-NEXT: packsswb %xmm7, %xmm6
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE41-NEXT: pmovsxwd {{.*#+}} xmm5 = [255,255,255,255]
; SSE41-NEXT: pand %xmm5, %xmm4
; SSE41-NEXT: pand %xmm5, %xmm3
; SSE41-NEXT: packusdw %xmm4, %xmm3
@@ -1047,7 +1047,7 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
;
; SSE41-LABEL: test15:
; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
; SSE41-NEXT: pminud %xmm3, %xmm2
; SSE41-NEXT: pminud %xmm3, %xmm1
; SSE41-NEXT: packusdw %xmm2, %xmm1
@@ -1565,7 +1565,7 @@ define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind {
;
; SSE41-LABEL: psubus_8i32_max:
; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
; SSE41-NEXT: pminud %xmm3, %xmm2
; SSE41-NEXT: pminud %xmm3, %xmm1
; SSE41-NEXT: packusdw %xmm2, %xmm1
@@ -1871,7 +1871,7 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
;
; SSE41-LABEL: psubus_16i32_max:
; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535]
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm6 = [65535,0,65535,0,65535,0,65535,0]
; SSE41-NEXT: pminud %xmm6, %xmm3
; SSE41-NEXT: pminud %xmm6, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
@@ -1975,7 +1975,7 @@ define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwin
;
; SSE41-LABEL: psubus_i16_i32_max_swapped:
; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
; SSE41-NEXT: pminud %xmm3, %xmm2
; SSE41-NEXT: pminud %xmm3, %xmm1
; SSE41-NEXT: packusdw %xmm2, %xmm1
@@ -2070,7 +2070,7 @@ define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind {
;
; SSE41-LABEL: psubus_i16_i32_min:
; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
; SSE41-NEXT: pminud %xmm3, %xmm2
; SSE41-NEXT: pminud %xmm3, %xmm1
; SSE41-NEXT: packusdw %xmm2, %xmm1
@@ -2659,7 +2659,7 @@ define <8 x i16> @test32(<8 x i16> %a0, <8 x i32> %a1) {
;
; SSE41-LABEL: test32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
; SSE41-NEXT: pminud %xmm3, %xmm2
; SSE41-NEXT: pminud %xmm3, %xmm1
; SSE41-NEXT: packusdw %xmm2, %xmm1
@@ -2993,7 +2993,7 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) {
; SSE41-LABEL: test34:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm6
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,1,1,1]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm6
; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/sat-add.ll
@@ -693,7 +693,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_sum(<2 x i64> %x) {
;
; SSE41-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42,42]
; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [42,42]
; SSE41-NEXT: paddq %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: pxor %xmm2, %xmm0
@@ -757,7 +757,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_notval(<2 x i64> %x) {
;
; SSE41-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42,42]
; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [42,42]
; SSE41-NEXT: paddq %xmm0, %xmm1
; SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/setcc-non-simple-type.ll
@@ -106,10 +106,10 @@ define void @failing(ptr %0, ptr %1) nounwind {
; CHECK-AVX2-NEXT: movq 8(%rdi), %rax
; CHECK-AVX2-NEXT: movq 24(%rsi), %rcx
; CHECK-AVX2-NEXT: movq 32(%rsi), %rdx
; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,1]
; CHECK-AVX2-NEXT: xorl %esi, %esi
; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1]
; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,2]
; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,1]
; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [2,2]
; CHECK-AVX2-NEXT: .p2align 4, 0x90
; CHECK-AVX2-NEXT: .LBB0_1: # %vector.ph
; CHECK-AVX2-NEXT: # =>This Loop Header: Depth=1
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/sext-vsetcc.ll
@@ -228,7 +228,7 @@ define <4 x i32> @cmp_ult_load_const(ptr %x) nounwind {
; AVX-LABEL: cmp_ult_load_const:
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [42,214,0,255]
; AVX-NEXT: vpmovsxwd {{.*#+}} xmm1 = [42,214,0,255]
; AVX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%loadx = load <4 x i8>, ptr %x
@@ -292,7 +292,7 @@ define <4 x i32> @cmp_slt_load_const(ptr %x) nounwind {
; AVX-LABEL: cmp_slt_load_const:
; AVX: # %bb.0:
; AVX-NEXT: vpmovsxbd (%rdi), %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [42,4294967254,0,4294967295]
; AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [42,4294967254,0,4294967295]
; AVX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%loadx = load <4 x i8>, ptr %x
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
@@ -65,7 +65,7 @@ define void @shuffle_v16i32_to_v8i32_1(ptr %L, ptr %S) nounwind {
; AVX512BWVL-FAST-ALL-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512BWVL-FAST-ALL: # %bb.0:
; AVX512BWVL-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
; AVX512BWVL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
; AVX512BWVL-FAST-ALL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-FAST-ALL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512BWVL-FAST-ALL-NEXT: vzeroupper
17 changes: 6 additions & 11 deletions llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -810,8 +810,7 @@ define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) no
;
; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-ALL-NEXT: vzeroupper
@@ -882,8 +881,7 @@ define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) no
;
; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-ALL-NEXT: vzeroupper
@@ -1307,16 +1305,15 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14]
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615]
; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: negative:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -1326,8 +1323,7 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
; AVX512F-LABEL: negative:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615]
; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
Expand All @@ -1346,8 +1342,7 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
; AVX512BW-LABEL: negative:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512BW-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615]
; AVX512BW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -34,7 +34,7 @@ define void @shuffle_v64i8_to_v32i8(ptr %L, ptr %S) nounwind {
; AVX512VL-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-FAST-ALL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-FAST-ALL-NEXT: vzeroupper
40 changes: 20 additions & 20 deletions llvm/test/CodeGen/X86/slow-pmulld.ll
@@ -101,7 +101,7 @@ define <4 x i32> @test_mul_v4i32_v4i8(<4 x i8> %A) {
define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
; SLM-LABEL: test_mul_v8i32_v8i8:
; SLM: # %bb.0:
; SLM-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SLM-NEXT: pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLM-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLM-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
@@ -114,7 +114,7 @@ define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLOW-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLOW-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SLOW-NEXT: pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW-NEXT: pmaddwd %xmm2, %xmm0
; SLOW-NEXT: pmaddwd %xmm2, %xmm1
; SLOW-NEXT: ret{{[l|q]}}
@@ -124,7 +124,7 @@ define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SSE4-NEXT: pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-NEXT: pmaddwd %xmm2, %xmm1
; SSE4-NEXT: ret{{[l|q]}}
@@ -199,7 +199,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
; SLM-LABEL: test_mul_v16i32_v16i8:
; SLM: # %bb.0:
; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; SLM-NEXT: movdqa {{.*#+}} xmm5 = [18778,0,18778,0,18778,0,18778,0]
; SLM-NEXT: pmovsxwd {{.*#+}} xmm5 = [18778,18778,18778,18778]
; SLM-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SLM-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -221,7 +221,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLOW-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLOW-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0]
; SLOW-NEXT: pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SLOW-NEXT: pmaddwd %xmm4, %xmm0
; SLOW-NEXT: pmaddwd %xmm4, %xmm1
; SLOW-NEXT: pmaddwd %xmm4, %xmm2
@@ -237,7 +237,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0]
; SSE4-NEXT: pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SSE4-NEXT: pmaddwd %xmm4, %xmm0
; SSE4-NEXT: pmaddwd %xmm4, %xmm1
; SSE4-NEXT: pmaddwd %xmm4, %xmm2
@@ -399,7 +399,7 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
; SSE4-NEXT: pxor %xmm1, %xmm1
; SSE4-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-NEXT: pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-NEXT: pmulld %xmm1, %xmm2
; SSE4-NEXT: pmulld %xmm0, %xmm1
; SSE4-NEXT: movdqa %xmm2, %xmm0
@@ -480,7 +480,7 @@ define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
; SSE4-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE4-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SSE4-NEXT: pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SSE4-NEXT: pmulld %xmm3, %xmm0
; SSE4-NEXT: pmulld %xmm3, %xmm4
; SSE4-NEXT: pmulld %xmm3, %xmm2
@@ -621,7 +621,7 @@ define <4 x i32> @test_mul_v4i32_v4i8_minsize(<4 x i8> %A) minsize {
define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
; SLM-LABEL: test_mul_v8i32_v8i8_minsize:
; SLM: # %bb.0:
; SLM-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SLM-NEXT: pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLM-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLM-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
@@ -634,7 +634,7 @@ define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLOW-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLOW-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SLOW-NEXT: pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW-NEXT: pmaddwd %xmm2, %xmm0
; SLOW-NEXT: pmaddwd %xmm2, %xmm1
; SLOW-NEXT: ret{{[l|q]}}
@@ -644,7 +644,7 @@ define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SSE4-NEXT: pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-NEXT: pmaddwd %xmm2, %xmm1
; SSE4-NEXT: ret{{[l|q]}}
@@ -719,7 +719,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
; SLM-LABEL: test_mul_v16i32_v16i8_minsize:
; SLM: # %bb.0:
; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; SLM-NEXT: movdqa {{.*#+}} xmm5 = [18778,0,18778,0,18778,0,18778,0]
; SLM-NEXT: pmovsxwd {{.*#+}} xmm5 = [18778,18778,18778,18778]
; SLM-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SLM-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -741,7 +741,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLOW-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLOW-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0]
; SLOW-NEXT: pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SLOW-NEXT: pmaddwd %xmm4, %xmm0
; SLOW-NEXT: pmaddwd %xmm4, %xmm1
; SLOW-NEXT: pmaddwd %xmm4, %xmm2
@@ -757,7 +757,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0]
; SSE4-NEXT: pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SSE4-NEXT: pmaddwd %xmm4, %xmm0
; SSE4-NEXT: pmaddwd %xmm4, %xmm1
; SSE4-NEXT: pmaddwd %xmm4, %xmm2
@@ -861,7 +861,7 @@ define <4 x i32> @test_mul_v4i32_v4i16_minsize(<4 x i16> %A) minsize {
define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
; SLM-LABEL: test_mul_v8i32_v8i16_minsize:
; SLM: # %bb.0:
; SLM-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLM-NEXT: pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLM-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM-NEXT: pxor %xmm3, %xmm3
; SLM-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
@@ -875,7 +875,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
; SLOW-NEXT: pxor %xmm1, %xmm1
; SLOW-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SLOW-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLOW-NEXT: pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLOW-NEXT: pmulld %xmm1, %xmm2
; SLOW-NEXT: pmulld %xmm0, %xmm1
; SLOW-NEXT: movdqa %xmm2, %xmm0
@@ -886,7 +886,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
; SSE4-NEXT: pxor %xmm1, %xmm1
; SSE4-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-NEXT: pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-NEXT: pmulld %xmm1, %xmm2
; SSE4-NEXT: pmulld %xmm0, %xmm1
; SSE4-NEXT: movdqa %xmm2, %xmm0
@@ -906,7 +906,7 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
; SLM-LABEL: test_mul_v16i32_v16i16_minsize:
; SLM: # %bb.0:
; SLM-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SLM-NEXT: pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SLM-NEXT: movdqa %xmm0, %xmm4
; SLM-NEXT: pxor %xmm5, %xmm5
; SLM-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -928,7 +928,7 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
; SLOW-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SLOW-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SLOW-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SLOW-NEXT: pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SLOW-NEXT: pmulld %xmm3, %xmm0
; SLOW-NEXT: pmulld %xmm3, %xmm4
; SLOW-NEXT: pmulld %xmm3, %xmm2
@@ -944,7 +944,7 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
; SSE4-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE4-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SSE4-NEXT: pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SSE4-NEXT: pmulld %xmm3, %xmm0
; SSE4-NEXT: pmulld %xmm3, %xmm4
; SSE4-NEXT: pmulld %xmm3, %xmm2
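Every hunk in this file is the same transformation: 18778 fits in a signed 16-bit word, so the dword splat [18778,18778,18778,18778], whose in-memory image is exactly the old word layout [18778,0,18778,0,...], can be recreated by pmovsxwd from an 8-byte constant instead of a full 16-byte movdqa load. A hedged SSE4.1 intrinsics sketch of the equivalence (illustrative, not the compiler's own code):

    /* Build with: cc -msse4.1 demo.c */
    #include <assert.h>
    #include <smmintrin.h>      /* SSE4.1: _mm_cvtepi16_epi32 is pmovsxwd */
    #include <stdint.h>

    int main(void) {
        /* New form: an 8-byte constant-pool entry, four words of 18778. */
        int16_t words[4] = {18778, 18778, 18778, 18778};
        __m128i v =
            _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)words));

        /* Old form: the 16-byte dword splat that movdqa used to load. */
        __m128i ref = _mm_set1_epi32(18778);

        /* Every byte lane of the comparison must be all-ones. */
        assert(_mm_movemask_epi8(_mm_cmpeq_epi32(v, ref)) == 0xFFFF);
        return 0;
    }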
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
@@ -250,7 +250,7 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; SSE41-NEXT: subq %rax, %rdi
; SSE41-NEXT: movq %rdi, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [8589934591,8589934591]
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [4294967295,1,4294967295,1]
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: movabsq $2049638230412172401, %rdx # imm = 0x1C71C71C71C71C71
; SSE41-NEXT: movq %rcx, %rax
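The rewritten constant here is the <2 x i33> all-ones mask: 8589934591 = 2^33 - 1 = 0x1FFFFFFFF. As little-endian dwords each qword lane is [4294967295,1], and those dwords fit in signed bytes (-1 and 1), which is what lets pmovsxbd materialize the mask from only four bytes. A scalar C check of the decomposition (same little-endian assumption as before):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void) {
        /* 2^33 - 1: the mask that truncates a qword to its low 33 bits. */
        uint64_t mask = 8589934591ULL;
        assert(mask == (1ULL << 33) - 1);

        /* Sign-extend the bytes [-1, 1, -1, 1] to dwords (pmovsxbd). */
        int8_t bytes[4] = {-1, 1, -1, 1};
        uint32_t dwords[4];
        for (int i = 0; i < 4; i++)
            dwords[i] = (uint32_t)(int32_t)bytes[i];

        /* Same in-memory image as the qword pair [mask, mask]. */
        uint64_t lanes[2] = {mask, mask};
        assert(memcmp(dwords, lanes, 16) == 0);
        return 0;
    }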
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -555,7 +555,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,268435456,u]
; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm1
@@ -1357,7 +1357,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,268435456,u]
; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm1
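A different wrinkle in this file: the odd dword lanes of the old [1,u,268435456,u] constant were don't-cares, since pmuludq only reads the even dword of each qword operand. Storing the two live values as the qwords [1,268435456] and sign-extending them from dwords with pmovsxdq is therefore behavior-preserving while halving the constant. A sketch with SSE4.1/SSE2 intrinsics (illustrative):

    /* Build with: cc -msse4.1 demo.c */
    #include <assert.h>
    #include <smmintrin.h>      /* _mm_cvtepi32_epi64 is pmovsxdq */
    #include <stdint.h>

    int main(void) {
        /* 8-byte constant: the two multipliers pmuludq actually reads. */
        int32_t dwords[2] = {1, 268435456};       /* 1 and 2^28 */
        __m128i m =
            _mm_cvtepi32_epi64(_mm_loadl_epi64((const __m128i *)dwords));

        /* pmuludq (_mm_mul_epu32) multiplies dword lanes 0 and 2 only,
           so the odd lanes of the old constant never mattered. */
        __m128i x = _mm_set_epi32(0, 7, 0, 5);    /* qwords [5, 7] */
        __m128i p = _mm_mul_epu32(x, m);

        uint64_t out[2];
        _mm_storeu_si128((__m128i *)out, p);
        assert(out[0] == 5 && out[1] == (7ULL << 28));
        return 0;
    }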
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/srem-vector-lkk.ll
@@ -148,7 +148,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; SSE-NEXT: psrlw $15, %xmm2
; SSE-NEXT: psraw $6, %xmm1
; SSE-NEXT: paddw %xmm2, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
; SSE-NEXT: pmovsxbw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
; SSE-NEXT: pmullw %xmm1, %xmm2
; SSE-NEXT: psubw %xmm2, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/sse-domains.ll
@@ -17,7 +17,7 @@ define void @f(ptr nocapture %p, i32 %n) nounwind uwtable ssp {
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: addq $16, %rdi
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [127,127,127,127]
; CHECK-NEXT: pmovsxbd {{.*#+}} xmm0 = [127,127,127,127]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB0_1: ## %while.body
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/subvector-broadcast.ll
@@ -806,7 +806,7 @@ define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64>
; X86-AVX1-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
; X86-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm4
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [3,0,4,0]
; X86-AVX1-NEXT: vpmovsxbq {{.*#+}} xmm5 = [3,4]
; X86-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
; X86-AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm6
@@ -827,7 +827,7 @@ define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64>
;
; X86-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X86-AVX2: # %bb.0: # %entry
; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
; X86-AVX2-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,3,4]
; X86-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
; X86-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; X86-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
@@ -856,7 +856,7 @@ define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64>
; X64-AVX1-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,3,4]
; X64-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm4
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [3,4]
; X64-AVX1-NEXT: vpmovsxbq {{.*#+}} xmm5 = [3,4]
; X64-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
; X64-AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm6
@@ -877,7 +877,7 @@ define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64>
;
; X64-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX2: # %bb.0: # %entry
; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,3,4]
; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,3,4]
; X64-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
; X64-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
@@ -168,7 +168,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; SSE41-NEXT: pinsrd $2, %edx, %xmm0
; SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047]
; SSE41-NEXT: pmovsxwd {{.*#+}} xmm1 = [2047,2047,2047,2047]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: psrld $1, %xmm2
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -483,7 +483,7 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,268435456,u]
; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm1
@@ -915,7 +915,7 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE41-LABEL: test_urem_odd_INT_MIN:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,2,u]
; CHECK-SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,2]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm1
@@ -1145,7 +1145,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,268435456,u]
; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm1
@@ -1797,7 +1797,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,268435456,u]
; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm1
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
@@ -564,7 +564,7 @@ define <4 x i32> @test_urem_allones(<4 x i32> %X) nounwind {
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: psubd %xmm0, %xmm1
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
@@ -155,7 +155,7 @@ define <8 x i1> @t2_narrow(<8 x i16> %X) nounwind {
; CHECK-SSE41-LABEL: t2_narrow:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [21845,65535,65535,65535,21845,65535,65535,65535]
; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [18446744073709507925,18446744073709507925]
; CHECK-SSE41-NEXT: pminuw %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqw %xmm1, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
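Note that the chosen source width need not match the vector's logical element type: the word mask [21845,65535,65535,65535,...] is, per qword, 0xFFFFFFFFFFFF5555 = 18446744073709507925, the sign extension of the dword 0xFFFF5555 = -43691, hence pmovsxdq even though the constant feeds a word-wise pminuw. A scalar check (little-endian assumption again):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void) {
        /* Old constant: eight words, [0x5555,0xFFFF,0xFFFF,0xFFFF] per qword. */
        uint16_t words[8] = {21845, 65535, 65535, 65535,
                             21845, 65535, 65535, 65535};

        /* New constant: each qword sign-extended from the dword -43691. */
        int32_t dw = (int32_t)0xFFFF5555;             /* -43691 */
        uint64_t lanes[2] = {(uint64_t)(int64_t)dw, (uint64_t)(int64_t)dw};

        assert(lanes[0] == 18446744073709507925ULL);  /* 0xFFFFFFFFFFFF5555 */
        assert(memcmp(words, lanes, 16) == 0);
        return 0;
    }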
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/urem-vector-lkk.ll
@@ -115,7 +115,7 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
; SSE-NEXT: pmulhuw %xmm0, %xmm1
; SSE-NEXT: psrlw $6, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
; SSE-NEXT: pmovsxbw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
; SSE-NEXT: pmullw %xmm1, %xmm2
; SSE-NEXT: psubw %xmm2, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
@@ -139,7 +139,7 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; SSE-LABEL: dont_fold_urem_power_of_two:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63]
; SSE-NEXT: pmovsxbd {{.*#+}} xmm1 = [63,63,63,63]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pextrw $1, %xmm0, %eax
; SSE-NEXT: andl $31, %eax
66 changes: 50 additions & 16 deletions llvm/test/CodeGen/X86/usub_sat_vec.ll
@@ -1093,22 +1093,56 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
}

define void @PR48223(ptr %p0) {
; SSE-LABEL: PR48223:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: movdqa 32(%rdi), %xmm2
; SSE-NEXT: movdqa 48(%rdi), %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64]
; SSE-NEXT: psubusw %xmm4, %xmm1
; SSE-NEXT: psubusw %xmm4, %xmm0
; SSE-NEXT: psubusw %xmm4, %xmm3
; SSE-NEXT: psubusw %xmm4, %xmm2
; SSE-NEXT: movdqa %xmm2, 32(%rdi)
; SSE-NEXT: movdqa %xmm3, 48(%rdi)
; SSE-NEXT: movdqa %xmm0, (%rdi)
; SSE-NEXT: movdqa %xmm1, 16(%rdi)
; SSE-NEXT: retq
; SSE2-LABEL: PR48223:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: movdqa 32(%rdi), %xmm2
; SSE2-NEXT: movdqa 48(%rdi), %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64]
; SSE2-NEXT: psubusw %xmm4, %xmm1
; SSE2-NEXT: psubusw %xmm4, %xmm0
; SSE2-NEXT: psubusw %xmm4, %xmm3
; SSE2-NEXT: psubusw %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, 32(%rdi)
; SSE2-NEXT: movdqa %xmm3, 48(%rdi)
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: movdqa %xmm1, 16(%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR48223:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa (%rdi), %xmm0
; SSSE3-NEXT: movdqa 16(%rdi), %xmm1
; SSSE3-NEXT: movdqa 32(%rdi), %xmm2
; SSSE3-NEXT: movdqa 48(%rdi), %xmm3
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64]
; SSSE3-NEXT: psubusw %xmm4, %xmm1
; SSSE3-NEXT: psubusw %xmm4, %xmm0
; SSSE3-NEXT: psubusw %xmm4, %xmm3
; SSSE3-NEXT: psubusw %xmm4, %xmm2
; SSSE3-NEXT: movdqa %xmm2, 32(%rdi)
; SSSE3-NEXT: movdqa %xmm3, 48(%rdi)
; SSSE3-NEXT: movdqa %xmm0, (%rdi)
; SSSE3-NEXT: movdqa %xmm1, 16(%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR48223:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm0
; SSE41-NEXT: movdqa 16(%rdi), %xmm1
; SSE41-NEXT: movdqa 32(%rdi), %xmm2
; SSE41-NEXT: movdqa 48(%rdi), %xmm3
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64]
; SSE41-NEXT: psubusw %xmm4, %xmm1
; SSE41-NEXT: psubusw %xmm4, %xmm0
; SSE41-NEXT: psubusw %xmm4, %xmm3
; SSE41-NEXT: psubusw %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm2, 32(%rdi)
; SSE41-NEXT: movdqa %xmm3, 48(%rdi)
; SSE41-NEXT: movdqa %xmm0, (%rdi)
; SSE41-NEXT: movdqa %xmm1, 16(%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: PR48223:
; AVX1: # %bb.0:
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/var-permute-256.ll
@@ -1215,8 +1215,8 @@ define <4 x i64> @PR50356(<4 x i64> %0, <4 x i32> %1, <4 x i64> %2) unnamed_addr
; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: vpcmpgtq %zmm0, %zmm2, %k1
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [17,51,85,119]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [34,68,102,136]
; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [17,51,85,119]
; AVX512-NEXT: vpmovsxwq {{.*#+}} ymm0 = [34,68,102,136]
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: movq %rbp, %rsp
@@ -1238,7 +1238,7 @@ define <4 x i64> @PR50356(<4 x i64> %0, <4 x i32> %1, <4 x i64> %2) unnamed_addr
; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: vpcmpgtq %ymm0, %ymm2, %k1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [34,68,102,136]
; AVX512VL-NEXT: vpmovsxwq {{.*#+}} ymm0 = [34,68,102,136]
; AVX512VL-NEXT: vmovdqa64 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k1}
; AVX512VL-NEXT: movq %rbp, %rsp
; AVX512VL-NEXT: popq %rbp
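The element width tracks the value range here: [17,51,85,119] fits in signed bytes and gets vpmovsxbq, while [34,68,102,136] does not (136 > 127) and needs vpmovsxwq from signed words. Presumably the rewrite picks the narrowest source width whose sign extension reproduces every lane; a scalar C sketch of that rule (min_sext_source_bits is a hypothetical helper, not LLVM's actual implementation):

    #include <assert.h>
    #include <stdint.h>

    /* Narrowest source width (8/16/32/64 bits) whose per-lane sign
       extension reproduces the qword vector exactly. */
    static int min_sext_source_bits(const int64_t *lanes, int n) {
        int bits = 8;
        for (int i = 0; i < n; i++) {
            while (bits < 64) {
                /* Keep the low `bits` bits and sign-extend them back.
                   (>> of a negative int64_t is arithmetic on x86 compilers.) */
                int64_t ext = (int64_t)((uint64_t)lanes[i] << (64 - bits))
                              >> (64 - bits);
                if (ext == lanes[i])
                    break;
                bits *= 2;      /* try 16, then 32, then 64 */
            }
        }
        return bits;
    }

    int main(void) {
        const int64_t a[4] = {17, 51, 85, 119};   /* all within [-128,127] */
        const int64_t b[4] = {34, 68, 102, 136};  /* 136 overflows int8 */
        assert(min_sext_source_bits(a, 4) == 8);  /* vpmovsxbq */
        assert(min_sext_source_bits(b, 4) == 16); /* vpmovsxwq */
        return 0;
    }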
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -1907,7 +1907,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
;
; SSE41-LABEL: uitofp_2i64_to_4f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1]
; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,1]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq $1, %xmm2
@@ -2022,7 +2022,7 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
;
; SSE41-LABEL: uitofp_2i64_to_2f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1]
; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,1]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq $1, %xmm2
@@ -2136,7 +2136,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
;
; SSE41-LABEL: uitofp_4i64_to_4f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1]
; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,1]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq $1, %xmm2
@@ -2531,7 +2531,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,1]
; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [1,1]
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: psrlq $1, %xmm3
@@ -4329,7 +4329,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm1
; SSE41-NEXT: movdqa 16(%rdi), %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,1]
; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [1,1]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm3
@@ -4735,7 +4735,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
; SSE41-NEXT: movdqa 16(%rdi), %xmm5
; SSE41-NEXT: movdqa 32(%rdi), %xmm6
; SSE41-NEXT: movdqa 48(%rdi), %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [1,1]
; SSE41-NEXT: pmovsxbq {{.*#+}} xmm7 = [1,1]
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: movdqa %xmm4, %xmm1
@@ -5625,7 +5625,7 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 {
;
; SSE41-LABEL: PR43609:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2,2]
; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [2,2]
; SSE41-NEXT: paddq %xmm0, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
18 changes: 9 additions & 9 deletions llvm/test/CodeGen/X86/vec_setcc-2.ll
@@ -32,7 +32,7 @@ define void @loop_no_const_reload(ptr %in, ptr %out, i32 %n) {
; SSE41-NEXT: je LBB0_3
; SSE41-NEXT: ## %bb.1: ## %for.body.preheader
; SSE41-NEXT: xorl %eax, %eax
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [25,25,25,25,25,25,25,25]
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [25,25,25,25,25,25,25,25]
; SSE41-NEXT: .p2align 4, 0x90
; SSE41-NEXT: LBB0_2: ## %for.body
; SSE41-NEXT: ## =>This Inner Loop Header: Depth=1
@@ -100,7 +100,7 @@ define void @loop_const_folding_underflow(ptr %in, ptr %out, i32 %n) {
; SSE41-NEXT: je LBB1_3
; SSE41-NEXT: ## %bb.1: ## %for.body.preheader
; SSE41-NEXT: xorl %eax, %eax
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,26,26,26,26,26,26,26]
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [0,26,26,26,26,26,26,26]
; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
; SSE41-NEXT: .p2align 4, 0x90
; SSE41-NEXT: LBB1_2: ## %for.body
@@ -219,7 +219,7 @@ define <4 x i1> @ugt_v4i32_splat(<4 x i32> %x) {
;
; SSE41-LABEL: ugt_v4i32_splat:
; SSE41: ## %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967255,4294967255,4294967255,4294967255]
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [4294967255,4294967255,4294967255,4294967255]
; SSE41-NEXT: pmaxud %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -291,7 +291,7 @@ define <4 x i1> @uge_v4i32_splat(<4 x i32> %x) {
;
; SSE41-LABEL: uge_v4i32_splat:
; SSE41: ## %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967254,4294967254,4294967254,4294967254]
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [4294967254,4294967254,4294967254,4294967254]
; SSE41-NEXT: pmaxud %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -364,7 +364,7 @@ define <4 x i1> @ult_v4i32_splat(<4 x i32> %x) {
;
; SSE41-LABEL: ult_v4i32_splat:
; SSE41: ## %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967253,4294967253,4294967253,4294967253]
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [4294967253,4294967253,4294967253,4294967253]
; SSE41-NEXT: pminud %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -434,7 +434,7 @@ define <4 x i1> @ule_v4i32_splat(<4 x i32> %x) {
;
; SSE41-LABEL: ule_v4i32_splat:
; SSE41: ## %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967254,4294967254,4294967254,4294967254]
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [4294967254,4294967254,4294967254,4294967254]
; SSE41-NEXT: pminud %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -501,7 +501,7 @@ define <4 x i1> @ugt_v4i32_nonsplat(<4 x i32> %x) {
;
; SSE41-LABEL: ugt_v4i32_nonsplat:
; SSE41: ## %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967254,4294967255,4294967256,4294967257]
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [4294967254,4294967255,4294967256,4294967257]
; SSE41-NEXT: pmaxud %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -520,7 +520,7 @@ define <4 x i1> @ugt_v4i32_splat_commute(<4 x i32> %x) {
;
; SSE41-LABEL: ugt_v4i32_splat_commute:
; SSE41: ## %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3,3,3,3]
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [3,3,3,3]
; SSE41-NEXT: pminud %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -544,7 +544,7 @@ define <8 x i16> @PR39859(<8 x i16> %x, <8 x i16> %y) {
; SSE41-LABEL: PR39859:
; SSE41: ## %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [43,43,43,43,43,43,43,43]
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [43,43,43,43,43,43,43,43]
; SSE41-NEXT: pmaxuw %xmm2, %xmm0
; SSE41-NEXT: pcmpeqw %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
62 changes: 42 additions & 20 deletions llvm/test/CodeGen/X86/vec_setcc.ll
@@ -163,16 +163,27 @@ define <16 x i8> @or_icmp_eq_const_1bit_diff(<16 x i8> %x) {
}

define <4 x i32> @or_icmp_ne_const_1bit_diff(<4 x i32> %x) {
; SSE-LABEL: or_icmp_ne_const_1bit_diff:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44,60,44,60]
; SSE-NEXT: pcmpeqd %xmm0, %xmm1
; SSE-NEXT: pcmpeqd %xmm2, %xmm2
; SSE-NEXT: pxor %xmm2, %xmm1
; SSE-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
; SSE2-LABEL: or_icmp_ne_const_1bit_diff:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [44,60,44,60]
; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: or_icmp_ne_const_1bit_diff:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [44,60,44,60]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
; SSE41-NEXT: pxor %xmm2, %xmm1
; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: or_icmp_ne_const_1bit_diff:
; AVX: # %bb.0:
@@ -215,16 +226,27 @@ define <16 x i8> @and_icmp_eq_const_1bit_diff(<16 x i8> %x) {
}

define <4 x i32> @and_icmp_ne_const_1bit_diff(<4 x i32> %x) {
; SSE-LABEL: and_icmp_ne_const_1bit_diff:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44,60,54,44]
; SSE-NEXT: pcmpeqd %xmm0, %xmm1
; SSE-NEXT: pcmpeqd %xmm2, %xmm2
; SSE-NEXT: pxor %xmm2, %xmm1
; SSE-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
; SSE2-LABEL: and_icmp_ne_const_1bit_diff:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [44,60,54,44]
; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: and_icmp_ne_const_1bit_diff:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [44,60,54,44]
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
; SSE41-NEXT: pxor %xmm2, %xmm1
; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: and_icmp_ne_const_1bit_diff:
; AVX: # %bb.0:
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/vec_shift6.ll
@@ -133,7 +133,7 @@ define <8 x i32> @test6(<8 x i32> %a) {
;
; SSE41-LABEL: test6:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [2,2,4,8]
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: pmulld %xmm2, %xmm1
; SSE41-NEXT: retq
@@ -218,7 +218,7 @@ define <16 x i32> @test8(<16 x i32> %a) {
;
; SSE41-LABEL: test8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [2,2,4,8]
; SSE41-NEXT: pmulld %xmm4, %xmm0
; SSE41-NEXT: pmulld %xmm4, %xmm1
; SSE41-NEXT: pmulld %xmm4, %xmm2
@@ -274,7 +274,7 @@ define <8 x i64> @test9(<8 x i64> %a) {
;
; AVX2-LABEL: test9:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,2,3]
; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,1,2,3]
; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllvq %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/X86/vec_smulo.ll
@@ -270,7 +270,7 @@ define <3 x i32> @smulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT: vpmuldq %xmm3, %xmm4, %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0
@@ -401,7 +401,7 @@ define <4 x i32> @smulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT: vpmuldq %xmm3, %xmm4, %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0
@@ -672,7 +672,7 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpmuldq %ymm3, %ymm4, %ymm3
; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1
; AVX512-NEXT: vpsrad $31, %ymm1, %ymm0
@@ -871,7 +871,7 @@ define <8 x i32> @smulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpmuldq %ymm3, %ymm4, %ymm3
; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1
; AVX512-NEXT: vpsrad $31, %ymm1, %ymm0
@@ -1229,7 +1229,7 @@ define <16 x i32> @smulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; AVX512-NEXT: vpshufd {{.*#+}} zmm3 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX512-NEXT: vpshufd {{.*#+}} zmm4 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX512-NEXT: vpmuldq %zmm3, %zmm4, %zmm3
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm1
; AVX512-NEXT: vpsrad $31, %zmm1, %zmm0
@@ -3191,7 +3191,7 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT: vpmuldq %xmm3, %xmm4, %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/X86/vec_umulo.ll
@@ -226,7 +226,7 @@ define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
@@ -342,7 +342,7 @@ define <4 x i32> @umulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
@@ -562,7 +562,7 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpmuludq %ymm3, %ymm4, %ymm3
; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
; AVX512-NEXT: vptestmd %ymm4, %ymm4, %k1
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1
@@ -732,7 +732,7 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
; AVX512-NEXT: vpmuludq %ymm3, %ymm4, %ymm3
; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
; AVX512-NEXT: vptestmd %ymm4, %ymm4, %k1
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1
@@ -1024,7 +1024,7 @@ define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; AVX512-NEXT: vpshufd {{.*#+}} zmm3 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX512-NEXT: vpshufd {{.*#+}} zmm4 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX512-NEXT: vpmuludq %zmm3, %zmm4, %zmm3
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
; AVX512-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm1
@@ -2820,7 +2820,7 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpsrld $24, %xmm1, %xmm0