diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll index 5021a25cb8a97..b151a7d567489 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -920,6 +920,359 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ret <16 x i8> %shift } +; +; Uniform Variable Modulo Shifts +; + +define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { +; SSE-LABEL: splatvar_modulo_shift_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; SSE-NEXT: psrlq %xmm1, %xmm2 +; SSE-NEXT: psrlq %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: psubq %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatvar_modulo_shift_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_modulo_shift_v2i64: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_modulo_shift_v2i64: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq +; +; AVX512-LABEL: splatvar_modulo_shift_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: # 
kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_modulo_shift_v2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpsraq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X86-SSE-LABEL: splatvar_modulo_shift_v2i64: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-SSE-NEXT: psrlq %xmm1, %xmm2 +; X86-SSE-NEXT: psrlq %xmm1, %xmm0 +; X86-SSE-NEXT: pxor %xmm2, %xmm0 +; X86-SSE-NEXT: psubq %xmm2, %xmm0 +; X86-SSE-NEXT: retl + %mod = and <2 x i64> %b, <i64 63, i64 63> + %splat = shufflevector <2 x i64> %mod, <2 x i64> undef, <2 x i32> zeroinitializer + %shift = ashr <2 x i64> %a, %splat + ret <2 x i64> %shift +} + +define <4 x i32> @splatvar_modulo_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { +; SSE2-LABEL: splatvar_modulo_shift_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: andl $31, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: psrad %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_modulo_shift_v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: psrad %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_modulo_shift_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_modulo_shift_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; AVX2-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_modulo_shift_v4i32: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; XOPAVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_modulo_shift_v4i32: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; XOPAVX2-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq +; +; AVX512-LABEL: splatvar_modulo_shift_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_modulo_shift_v4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512VL-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X86-SSE-LABEL: splatvar_modulo_shift_v4i32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: andl $31, %eax +; X86-SSE-NEXT: movd %eax, %xmm1 +; X86-SSE-NEXT: psrad %xmm1, %xmm0 +; X86-SSE-NEXT: retl + %mod = and <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31> + %splat = shufflevector <4 x i32> %mod, <4 x i32> undef, <4 x i32> zeroinitializer + %shift = ashr <4 x i32> %a, %splat + ret <4 x i32> %shift +} + +define <8 x i16> @splatvar_modulo_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { +; SSE2-LABEL: splatvar_modulo_shift_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; SSE2-NEXT: 
psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psraw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_modulo_shift_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psraw %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: splatvar_modulo_shift_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatvar_modulo_shift_v8i16: +; XOP: # %bb.0: +; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatvar_modulo_shift_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_modulo_shift_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X86-SSE-LABEL: splatvar_modulo_shift_v8i16: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE-NEXT: psraw %xmm1, %xmm0 +; X86-SSE-NEXT: retl + %mod = and <8 x i16> 
%b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> + %splat = shufflevector <8 x i16> %mod, <8 x i16> undef, <8 x i32> zeroinitializer + %shift = ashr <8 x i16> %a, %splat + ret <8 x i16> %shift +} + +define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { +; SSE2-LABEL: splatvar_modulo_shift_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_modulo_shift_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: psubb %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_modulo_shift_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_modulo_shift_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_modulo_shift_v16i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_modulo_shift_v16i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq +; +; AVX512DQ-LABEL: 
splatvar_modulo_shift_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpsrad %xmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_modulo_shift_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsraw %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQVL-NEXT: vpsrad %xmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X86-SSE-LABEL: splatvar_modulo_shift_v16i8: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; 
X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE-NEXT: psrlw %xmm1, %xmm0 +; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; X86-SSE-NEXT: psrlw %xmm1, %xmm2 +; X86-SSE-NEXT: psrlw $8, %xmm2 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; X86-SSE-NEXT: pand %xmm2, %xmm0 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; X86-SSE-NEXT: psrlw %xmm1, %xmm2 +; X86-SSE-NEXT: pxor %xmm2, %xmm0 +; X86-SSE-NEXT: psubb %xmm2, %xmm0 +; X86-SSE-NEXT: retl + %mod = and <16 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> + %splat = shufflevector <16 x i8> %mod, <16 x i8> undef, <16 x i32> zeroinitializer + %shift = ashr <16 x i8> %a, %splat + ret <16 x i8> %shift +} + ; ; Constant Shifts ; diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll index 94b3b3a75a4c4..a9bcc73f97fe1 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -1000,6 +1000,401 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ret <32 x i8> %shift } +; +; Uniform Variable Modulo Shifts +; + +define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { +; AVX1-LABEL: splatvar_modulo_shift_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, 
%xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_modulo_shift_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 +; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_modulo_shift_v4i64: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpshaq %xmm1, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_modulo_shift_v4i64: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 +; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; XOPAVX2-NEXT: retq +; +; AVX512-LABEL: splatvar_modulo_shift_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_modulo_shift_v4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm1, %xmm1 +; AVX512VL-NEXT: vpsraq %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; X86-AVX1-LABEL: splatvar_modulo_shift_v4i64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: splatvar_modulo_shift_v4i64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 +; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl + %mod = and <4 x i64> %b, <i64 63, i64 63, i64 63, i64 63> + %splat = shufflevector <4 x i64> %mod, <4 x i64> undef, <4 x i32> zeroinitializer + %shift = ashr <4 x i64> %a, %splat + ret <4 x i64> %shift +} + +define <8 x i32> @splatvar_modulo_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { +; AVX1-LABEL: splatvar_modulo_shift_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_modulo_shift_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxdq 
{{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_modulo_shift_v8i32: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_modulo_shift_v8i32: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; XOPAVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: retq +; +; AVX512-LABEL: splatvar_modulo_shift_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512-NEXT: vpsrad %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_modulo_shift_v8i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; X86-AVX1-LABEL: splatvar_modulo_shift_v8i32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: splatvar_modulo_shift_v8i32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; X86-AVX2-NEXT: vpand 
%xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X86-AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl + %mod = and <8 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> + %splat = shufflevector <8 x i32> %mod, <8 x i32> undef, <8 x i32> zeroinitializer + %shift = ashr <8 x i32> %a, %splat + ret <8 x i32> %shift +} + +define <16 x i16> @splatvar_modulo_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { +; AVX1-LABEL: splatvar_modulo_shift_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_modulo_shift_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_modulo_shift_v16i16: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_modulo_shift_v16i16: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; XOPAVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: retq +; +; AVX512-LABEL: splatvar_modulo_shift_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: 
vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512-NEXT: vpsraw %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_modulo_shift_v16i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; X86-AVX1-LABEL: splatvar_modulo_shift_v16i16: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: splatvar_modulo_shift_v16i16: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X86-AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl + %mod = and <16 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> + %splat = shufflevector <16 x i16> %mod, <16 x i16> undef, <16 x i32> zeroinitializer + %shift = ashr <16 x i16> %a, %splat + ret <16 x i16> %shift +} + +define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { +; AVX1-LABEL: splatvar_modulo_shift_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: 
vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_modulo_shift_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpshab %xmm1, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 
+; XOPAVX2-NEXT: vpshab %xmm1, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; XOPAVX2-NEXT: retq +; +; AVX512DQ-LABEL: splatvar_modulo_shift_v32i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2 +; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_modulo_shift_v32i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_modulo_shift_v32i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; 
AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 +; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512DQVL-NEXT: vpternlogq $108, %ymm0, %ymm2, %ymm1 +; AVX512DQVL-NEXT: vpsubb %ymm2, %ymm1, %ymm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 +; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpsraw %xmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BWVL-NEXT: retq +; +; X86-AVX1-LABEL: splatvar_modulo_shift_v32i8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896] +; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4 +; X86-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: splatvar_modulo_shift_v32i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 +; X86-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 +; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl + %mod = and <32 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> + %splat = shufflevector <32 x i8> %mod, <32 x i8> undef, <32 x i32> zeroinitializer + %shift = ashr <32 x i8> %a, %splat + ret <32 x i8> %shift +} + ; ; Constant Shifts ; diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll index 6550968703ac2..bc878940f8839 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll @@ -226,6 +226,102 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ret <64 x i8> %shift } +; +; Uniform Variable Modulo Shifts +; + +define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { +; ALL-LABEL: splatvar_modulo_shift_v8i64: +; ALL: # %bb.0: +; ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; ALL-NEXT: vpsraq %xmm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %mod = and <8 x i64> %b, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63> + %splat = shufflevector <8 x i64> %mod, <8 x i64> undef, <8 x i32> zeroinitializer + %shift = ashr <8 x i64> %a, %splat + ret <8 x i64> %shift +} + +define <16 x i32> @splatvar_modulo_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { +; ALL-LABEL: splatvar_modulo_shift_v16i32: +; ALL: # %bb.0: +; ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; ALL-NEXT: vpand %xmm2, %xmm1, %xmm1 +; ALL-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; ALL-NEXT: vpsrad %xmm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %mod = and <16 x i32> %b, + %splat = shufflevector <16 x i32> %mod, <16 x i32> undef, <16 x i32> zeroinitializer + %shift = ashr <16 x i32> %a, %splat + ret <16 x i32> %shift +} + +define <32 x i16> @splatvar_modulo_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { +; AVX512DQ-LABEL: splatvar_modulo_shift_v32i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vpsraw %xmm1, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsraw %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_modulo_shift_v32i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq + %mod = and <32 x i16> %b, + %splat = shufflevector <32 x i16> %mod, <32 x i16> undef, <32 x i32> zeroinitializer + %shift = ashr <32 x i16> %a, %splat + ret <32 x i16> %shift +} + +define <64 x i8> @splatvar_modulo_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { +; AVX512DQ-LABEL: splatvar_modulo_shift_v64i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpbroadcastb %xmm3, %ymm3 +; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = 
[32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpxor %ymm4, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsubb %ymm4, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsubb %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_modulo_shift_v64i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX512BW-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 +; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 +; AVX512BW-NEXT: vpternlogq $108, %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm0 +; AVX512BW-NEXT: retq + %mod = and <64 x i8> %b, + %splat = shufflevector <64 x i8> %mod, <64 x i8> undef, <64 x i32> zeroinitializer + %shift = ashr <64 x i8> %a, %splat + ret <64 x i8> %shift +} + ; ; Constant Shifts ; diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll index 1d39c167c2802..76b32990a5299 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -766,6 +766,312 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ret <16 x i8> %shift } +; +; Uniform Variable Modulo 
Shifts +; + +define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { +; SSE-LABEL: splatvar_modulo_shift_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: psrlq %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatvar_modulo_shift_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatvar_modulo_shift_v2i64: +; XOP: # %bb.0: +; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOP-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatvar_modulo_shift_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_modulo_shift_v2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X86-SSE-LABEL: splatvar_modulo_shift_v2i64: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE-NEXT: psrlq %xmm1, %xmm0 +; X86-SSE-NEXT: retl + %mod = and <2 x i64> %b, + %splat = shufflevector <2 x i64> %mod, <2 x i64> undef, <2 x i32> zeroinitializer + %shift = lshr <2 x i64> %a, %splat + ret <2 x i64> %shift +} + +define <4 x i32> @splatvar_modulo_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { +; SSE2-LABEL: splatvar_modulo_shift_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: andl $31, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: psrld %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_modulo_shift_v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: psrld %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_modulo_shift_v4i32: +; AVX1: # 
%bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_modulo_shift_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX2-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_modulo_shift_v4i32: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; XOPAVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_modulo_shift_v4i32: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; XOPAVX2-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq +; +; AVX512-LABEL: splatvar_modulo_shift_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_modulo_shift_v4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512VL-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X86-SSE-LABEL: splatvar_modulo_shift_v4i32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: andl $31, %eax +; X86-SSE-NEXT: movd %eax, %xmm1 +; X86-SSE-NEXT: psrld %xmm1, %xmm0 +; X86-SSE-NEXT: retl + %mod = and <4 x i32> %b, + %splat = shufflevector <4 x i32> %mod, <4 x i32> undef, <4 x i32> zeroinitializer + 
%shift = lshr <4 x i32> %a, %splat + ret <4 x i32> %shift +} + +define <8 x i16> @splatvar_modulo_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { +; SSE2-LABEL: splatvar_modulo_shift_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_modulo_shift_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: splatvar_modulo_shift_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatvar_modulo_shift_v8i16: +; XOP: # %bb.0: +; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatvar_modulo_shift_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_modulo_shift_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X86-SSE-LABEL: splatvar_modulo_shift_v8i16: 
+; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE-NEXT: psrlw %xmm1, %xmm0 +; X86-SSE-NEXT: retl + %mod = and <8 x i16> %b, + %splat = shufflevector <8 x i16> %mod, <8 x i16> undef, <8 x i32> zeroinitializer + %shift = lshr <8 x i16> %a, %splat + ret <8 x i16> %shift +} + +define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { +; SSE2-LABEL: splatvar_modulo_shift_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_modulo_shift_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_modulo_shift_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: 
vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_modulo_shift_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_modulo_shift_v16i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_modulo_shift_v16i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq +; +; AVX512DQ-LABEL: splatvar_modulo_shift_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpsrld %xmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_modulo_shift_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQVL-NEXT: vpsrld %xmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X86-SSE-LABEL: splatvar_modulo_shift_v16i8: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE-NEXT: psrlw %xmm1, %xmm0 +; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; X86-SSE-NEXT: psrlw %xmm1, %xmm2 +; X86-SSE-NEXT: psrlw $8, %xmm2 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X86-SSE-NEXT: pand %xmm1, %xmm0 +; X86-SSE-NEXT: retl + %mod = and <16 x i8> %b, + %splat = shufflevector <16 x i8> %mod, <16 x i8> undef, <16 x i32> zeroinitializer + %shift = lshr <16 x i8> %a, %splat + ret <16 x i8> %shift +} + ; ; Constant Shifts ; diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll index 
163486ad41352..44256db3979df 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -816,6 +816,345 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ret <32 x i8> %shift } +; +; Uniform Variable Modulo Shifts +; + +define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { +; AVX1-LABEL: splatvar_modulo_shift_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_modulo_shift_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_modulo_shift_v4i64: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_modulo_shift_v4i64: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: retq +; +; AVX512-LABEL: splatvar_modulo_shift_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_modulo_shift_v4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; X86-AVX1-LABEL: splatvar_modulo_shift_v4i64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, 
%xmm1 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: splatvar_modulo_shift_v4i64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl + %mod = and <4 x i64> %b, + %splat = shufflevector <4 x i64> %mod, <4 x i64> undef, <4 x i32> zeroinitializer + %shift = lshr <4 x i64> %a, %splat + ret <4 x i64> %shift +} + +define <8 x i32> @splatvar_modulo_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { +; AVX1-LABEL: splatvar_modulo_shift_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_modulo_shift_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_modulo_shift_v8i32: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_modulo_shift_v8i32: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero +; XOPAVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: retq +; +; AVX512-LABEL: splatvar_modulo_shift_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512-NEXT: vpsrld %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_modulo_shift_v8i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512VL-NEXT: vpsrld %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; X86-AVX1-LABEL: splatvar_modulo_shift_v8i32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: splatvar_modulo_shift_v8i32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; X86-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X86-AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl + %mod = and <8 x i32> %b, + %splat = shufflevector <8 x i32> %mod, <8 x i32> undef, <8 x i32> zeroinitializer + %shift = lshr <8 x i32> %a, %splat + ret <8 x i32> %shift +} + +define <16 x i16> @splatvar_modulo_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { +; AVX1-LABEL: splatvar_modulo_shift_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: 
vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_modulo_shift_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_modulo_shift_v16i16: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_modulo_shift_v16i16: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; XOPAVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: retq +; +; AVX512-LABEL: splatvar_modulo_shift_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_modulo_shift_v16i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; X86-AVX1-LABEL: splatvar_modulo_shift_v16i16: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX1-NEXT: vpsrlw 
%xmm1, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: splatvar_modulo_shift_v16i16: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl + %mod = and <16 x i16> %b, + %splat = shufflevector <16 x i16> %mod, <16 x i16> undef, <16 x i32> zeroinitializer + %shift = lshr <16 x i16> %a, %splat + ret <16 x i16> %shift +} + +define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { +; AVX1-LABEL: splatvar_modulo_shift_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_modulo_shift_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: 
vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpshlb %xmm1, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; XOPAVX2-NEXT: vpshlb %xmm1, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; XOPAVX2-NEXT: retq +; +; AVX512DQ-LABEL: splatvar_modulo_shift_v32i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_modulo_shift_v32i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_modulo_shift_v32i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512DQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BWVL-NEXT: retq +; +; X86-AVX1-LABEL: splatvar_modulo_shift_v32i8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: splatvar_modulo_shift_v32i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; X86-AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 +; X86-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl + %mod = and <32 x i8> %b, + %splat = shufflevector <32 x i8> %mod, <32 x i8> undef, <32 x i32> zeroinitializer + %shift = lshr <32 x i8> %a, %splat + ret <32 x i8> %shift +} + ; ; Constant Shifts ; diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll index 1f5a20ddedb5c..ee2a38089ed49 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll @@ -181,6 +181,93 @@ define <64 x i8> 
@splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ret <64 x i8> %shift } +; +; Uniform Variable Modulo Shifts +; + +define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { +; ALL-LABEL: splatvar_modulo_shift_v8i64: +; ALL: # %bb.0: +; ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; ALL-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %mod = and <8 x i64> %b, + %splat = shufflevector <8 x i64> %mod, <8 x i64> undef, <8 x i32> zeroinitializer + %shift = lshr <8 x i64> %a, %splat + ret <8 x i64> %shift +} + +define <16 x i32> @splatvar_modulo_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { +; ALL-LABEL: splatvar_modulo_shift_v16i32: +; ALL: # %bb.0: +; ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; ALL-NEXT: vpand %xmm2, %xmm1, %xmm1 +; ALL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; ALL-NEXT: vpsrld %xmm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %mod = and <16 x i32> %b, + %splat = shufflevector <16 x i32> %mod, <16 x i32> undef, <16 x i32> zeroinitializer + %shift = lshr <16 x i32> %a, %splat + ret <16 x i32> %shift +} + +define <32 x i16> @splatvar_modulo_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { +; AVX512DQ-LABEL: splatvar_modulo_shift_v32i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_modulo_shift_v32i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq + %mod = and <32 x i16> %b, + %splat = 
shufflevector <32 x i16> %mod, <32 x i16> undef, <32 x i32> zeroinitializer + %shift = lshr <32 x i16> %a, %splat + ret <32 x i16> %shift +} + +define <64 x i8> @splatvar_modulo_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { +; AVX512DQ-LABEL: splatvar_modulo_shift_v64i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_modulo_shift_v64i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 +; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq + %mod = and <64 x i8> %b, + %splat = shufflevector <64 x i8> %mod, <64 x i8> undef, <64 x i32> zeroinitializer + %shift = lshr <64 x i8> %a, %splat + ret <64 x i8> %shift +} + ; ; Constant Shifts ; diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll index 7626f2454a5f0..c10c22b472d11 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ 
b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -671,6 +671,308 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ret <16 x i8> %shift } +; +; Uniform Variable Modulo Shifts +; + +define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { +; SSE-LABEL: splatvar_modulo_shift_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: psllq %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatvar_modulo_shift_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatvar_modulo_shift_v2i64: +; XOP: # %bb.0: +; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOP-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatvar_modulo_shift_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_modulo_shift_v2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X86-SSE-LABEL: splatvar_modulo_shift_v2i64: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE-NEXT: psllq %xmm1, %xmm0 +; X86-SSE-NEXT: retl + %mod = and <2 x i64> %b, + %splat = shufflevector <2 x i64> %mod, <2 x i64> undef, <2 x i32> zeroinitializer + %shift = shl <2 x i64> %a, %splat + ret <2 x i64> %shift +} + +define <4 x i32> @splatvar_modulo_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { +; SSE2-LABEL: splatvar_modulo_shift_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: andl $31, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pslld %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_modulo_shift_v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: pslld %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_modulo_shift_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_modulo_shift_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX2-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_modulo_shift_v4i32: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; XOPAVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_modulo_shift_v4i32: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; XOPAVX2-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq +; +; AVX512-LABEL: splatvar_modulo_shift_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_modulo_shift_v4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512VL-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X86-SSE-LABEL: splatvar_modulo_shift_v4i32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: 
andl $31, %eax +; X86-SSE-NEXT: movd %eax, %xmm1 +; X86-SSE-NEXT: pslld %xmm1, %xmm0 +; X86-SSE-NEXT: retl + %mod = and <4 x i32> %b, + %splat = shufflevector <4 x i32> %mod, <4 x i32> undef, <4 x i32> zeroinitializer + %shift = shl <4 x i32> %a, %splat + ret <4 x i32> %shift +} + +define <8 x i16> @splatvar_modulo_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { +; SSE2-LABEL: splatvar_modulo_shift_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psllw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_modulo_shift_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psllw %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: splatvar_modulo_shift_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: splatvar_modulo_shift_v8i16: +; XOP: # %bb.0: +; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: splatvar_modulo_shift_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_modulo_shift_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm1, %xmm1 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; X86-SSE-LABEL: splatvar_modulo_shift_v8i16: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE-NEXT: psllw %xmm1, %xmm0 +; X86-SSE-NEXT: retl + %mod = and <8 x i16> %b, + %splat = shufflevector <8 x i16> %mod, <8 x i16> undef, <8 x i32> zeroinitializer + %shift = shl <8 x i16> %a, %splat + ret <8 x i16> %shift +} + +define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { +; SSE2-LABEL: splatvar_modulo_shift_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psllw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: psllw %xmm1, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_modulo_shift_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psllw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: psllw %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pshufb %xmm1, %xmm2 +; SSE41-NEXT: 
pand %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_modulo_shift_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_modulo_shift_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_modulo_shift_v16i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_modulo_shift_v16i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq +; +; AVX512DQ-LABEL: splatvar_modulo_shift_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpslld %xmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_modulo_shift_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQVL-NEXT: vpslld %xmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X86-SSE-LABEL: splatvar_modulo_shift_v16i8: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE-NEXT: psllw %xmm1, %xmm0 +; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; X86-SSE-NEXT: psllw %xmm1, %xmm2 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X86-SSE-NEXT: pand %xmm1, %xmm0 +; X86-SSE-NEXT: retl + %mod = and <16 x i8> %b, + %splat = shufflevector <16 x i8> %mod, <16 x i8> undef, <16 x i32> zeroinitializer + %shift = shl <16 x i8> %a, %splat + ret <16 x i8> %shift +} + ; ; Constant Shifts ; diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll index d151177d96c77..fd0ee8f051543 100644 
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ -741,6 +741,340 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ret <32 x i8> %shift } +; +; Uniform Variable Modulo Shifts +; + +define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { +; AVX1-LABEL: splatvar_modulo_shift_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_modulo_shift_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_modulo_shift_v4i64: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_modulo_shift_v4i64: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: retq +; +; AVX512-LABEL: splatvar_modulo_shift_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpsllq %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_modulo_shift_v4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; X86-AVX1-LABEL: splatvar_modulo_shift_v4i64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-AVX1-NEXT: vextractf128 
$1, %ymm0, %xmm2 +; X86-AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: splatvar_modulo_shift_v4i64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl + %mod = and <4 x i64> %b, + %splat = shufflevector <4 x i64> %mod, <4 x i64> undef, <4 x i32> zeroinitializer + %shift = shl <4 x i64> %a, %splat + ret <4 x i64> %shift +} + +define <8 x i32> @splatvar_modulo_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { +; AVX1-LABEL: splatvar_modulo_shift_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_modulo_shift_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_modulo_shift_v8i32: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_modulo_shift_v8i32: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; 
XOPAVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: retq +; +; AVX512-LABEL: splatvar_modulo_shift_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512-NEXT: vpslld %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_modulo_shift_v8i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512VL-NEXT: vpslld %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; X86-AVX1-LABEL: splatvar_modulo_shift_v8i32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: splatvar_modulo_shift_v8i32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; X86-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X86-AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl + %mod = and <8 x i32> %b, + %splat = shufflevector <8 x i32> %mod, <8 x i32> undef, <8 x i32> zeroinitializer + %shift = shl <8 x i32> %a, %splat + ret <8 x i32> %shift +} + +define <16 x i16> @splatvar_modulo_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { +; AVX1-LABEL: splatvar_modulo_shift_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; 
AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_modulo_shift_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_modulo_shift_v16i16: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_modulo_shift_v16i16: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; XOPAVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: retq +; +; AVX512-LABEL: splatvar_modulo_shift_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_modulo_shift_v16i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; X86-AVX1-LABEL: splatvar_modulo_shift_v16i16: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2 +; 
X86-AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: splatvar_modulo_shift_v16i16: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X86-AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl + %mod = and <16 x i16> %b, + %splat = shufflevector <16 x i16> %mod, <16 x i16> undef, <16 x i32> zeroinitializer + %shift = shl <16 x i16> %a, %splat + ret <16 x i16> %shift +} + +define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { +; AVX1-LABEL: splatvar_modulo_shift_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_modulo_shift_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; 
XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpshlb %xmm1, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; XOPAVX2-NEXT: vpshlb %xmm1, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; XOPAVX2-NEXT: retq +; +; AVX512DQ-LABEL: splatvar_modulo_shift_v32i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpsllw %xmm1, %xmm2, %xmm1 +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_modulo_shift_v32i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, 
%zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_modulo_shift_v32i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQVL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX512DQVL-NEXT: vpsllw %xmm1, %xmm2, %xmm1 +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512DQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpsllw %xmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BWVL-NEXT: retq +; +; X86-AVX1-LABEL: splatvar_modulo_shift_v32i8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; X86-AVX1-NEXT: 
vpshufb %xmm4, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: splatvar_modulo_shift_v32i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1 +; X86-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 +; X86-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl + %mod = and <32 x i8> %b, + %splat = shufflevector <32 x i8> %mod, <32 x i8> undef, <32 x i32> zeroinitializer + %shift = shl <32 x i8> %a, %splat + ret <32 x i8> %shift +} + ; ; Constant Shifts ; diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll index 45c653c3e78b9..655bd6319a6a2 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll @@ -174,6 +174,91 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ret <64 x i8> %shift } +; +; Uniform Variable Modulo Shifts +; + +define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { +; ALL-LABEL: splatvar_modulo_shift_v8i64: +; ALL: # %bb.0: +; ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; ALL-NEXT: vpsllq %xmm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %mod = and <8 x i64> %b, + %splat = shufflevector <8 x i64> %mod, <8 x i64> undef, <8 x i32> zeroinitializer + %shift = shl <8 x i64> %a, %splat + ret <8 x i64> %shift +} + +define <16 x i32> @splatvar_modulo_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { +; ALL-LABEL: splatvar_modulo_shift_v16i32: +; ALL: # %bb.0: +; ALL-NEXT: vpbroadcastd {{.*#+}} 
xmm2 = [31,31,31,31] +; ALL-NEXT: vpand %xmm2, %xmm1, %xmm1 +; ALL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; ALL-NEXT: vpslld %xmm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %mod = and <16 x i32> %b, + %splat = shufflevector <16 x i32> %mod, <16 x i32> undef, <16 x i32> zeroinitializer + %shift = shl <16 x i32> %a, %splat + ret <16 x i32> %shift +} + +define <32 x i16> @splatvar_modulo_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { +; AVX512DQ-LABEL: splatvar_modulo_shift_v32i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vpsllw %xmm1, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_modulo_shift_v32i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq + %mod = and <32 x i16> %b, + %splat = shufflevector <32 x i16> %mod, <32 x i16> undef, <32 x i32> zeroinitializer + %shift = shl <32 x i16> %a, %splat + ret <32 x i16> %shift +} + +define <64 x i8> @splatvar_modulo_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { +; AVX512DQ-LABEL: splatvar_modulo_shift_v64i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vpsllw %xmm1, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX512DQ-NEXT: 
vpsllw %xmm1, %xmm2, %xmm1 +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: splatvar_modulo_shift_v64i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsllw %xmm1, %xmm2, %xmm1 +; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq + %mod = and <64 x i8> %b, + %splat = shufflevector <64 x i8> %mod, <64 x i8> undef, <64 x i32> zeroinitializer + %shift = shl <64 x i8> %a, %splat + ret <64 x i8> %shift +} + ; ; Constant Shifts ;