diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6edf0185df813..6a9a2d9d687a3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30908,6 +30908,63 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
   }
 
+  if (VT == MVT::v64i8 && Subtarget.canExtendTo512BW()) {
+    // On AVX512BW, we can use variable 16-bit shifts to implement variable
+    // 8-bit shifts. For this, we split the input into two vectors, RLo and
+    // RHi. The i-th lane of RLo contains the (2*i)-th lane of R, and the
+    // i-th lane of RHi contains the (2*i+1)-th lane of R. After shifting,
+    // these vectors can efficiently be merged together using a masked move.
+    MVT ExtVT = MVT::v32i16;
+
+    SDValue RLo, RHi;
+    // Isolate lower and upper lanes of Amt by masking odd lanes in AmtLo and
+    // right shifting AmtHi.
+    SDValue AmtLo = DAG.getNode(ISD::AND, dl, ExtVT, DAG.getBitcast(ExtVT, Amt),
+                                DAG.getConstant(0x00ff, dl, ExtVT));
+    SDValue AmtHi = getTargetVShiftByConstNode(
+        X86ISD::VSRLI, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), 8, DAG);
+    switch (Opc) {
+    case ISD::SHL:
+      // Because we shift left, no bits from the high half can influence the
+      // low half, so we don't need to mask RLo. We do however need to mask
+      // RHi, to prevent high bits of an even lane overflowing into low bits
+      // of an odd lane.
+      RLo = DAG.getBitcast(ExtVT, R);
+      RHi = DAG.getNode(ISD::AND, dl, ExtVT, RLo,
+                        DAG.getConstant(0xff00, dl, ExtVT));
+      break;
+    case ISD::SRL:
+      // Same idea as above, but this time we need to make sure no low bits
+      // of an odd lane can overflow into high bits of an even lane.
+      RHi = DAG.getBitcast(ExtVT, R);
+      RLo = DAG.getNode(ISD::AND, dl, ExtVT, RHi,
+                        DAG.getConstant(0x00ff, dl, ExtVT));
+      break;
+    case ISD::SRA:
+      // For arithmetic right shifts, we want to sign extend each even lane
+      // of R such that the upper half of the corresponding lane of RLo is 0
+      // or -1 depending on the sign bit of the original lane. We do this
+      // using 2 immediate shifts.
+      RHi = DAG.getBitcast(ExtVT, R);
+      RLo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, RHi, 8, DAG);
+      RLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExtVT, RLo, 8, DAG);
+      break;
+    default:
+      llvm_unreachable("Unexpected Shift Op");
+    }
+
+    SDValue ShiftedLo =
+        DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RLo, AmtLo));
+    SDValue ShiftedHi =
+        DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RHi, AmtHi));
+
+    // To merge the shifted vectors back together, we select even lanes
+    // from ShiftedLo and odd lanes from ShiftedHi.
+    SDValue SelectMask = DAG.getBitcast(
+        MVT::v64i1, DAG.getConstant(0x5555555555555555, dl, MVT::i64));
+    return DAG.getSelect(dl, VT, SelectMask, ShiftedLo, ShiftedHi);
+  }
+
   if (VT == MVT::v16i8 ||
       (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
       (VT == MVT::v64i8 && Subtarget.hasBWI())) {
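Reviewer note: to sanity-check the lane bookkeeping above, here is a minimal standalone C++ sketch, not part of the patch, that models a single v32i16 lane, i.e. the two adjacent i8 lanes R[2*i] (low byte) and R[2*i+1] (high byte) packed little-endian into a uint16_t. The names shiftPair and Op are illustrative only; it exhaustively compares the RLo/RHi trick against independent per-byte shifts for amounts 0-7.

#include <cassert>
#include <cstdint>

enum Op { SHL, SRL, SRA };

// Word-wise model of the lowering: split into RLo/RHi, shift as i16, merge.
static uint16_t shiftPair(uint16_t R, uint16_t Amt, Op Opc) {
  unsigned AmtLo = Amt & 0x00ff; // amount for the even (low) byte
  unsigned AmtHi = Amt >> 8;     // amount for the odd (high) byte
  uint16_t ShiftedLo = 0, ShiftedHi = 0;
  switch (Opc) {
  case SHL:
    ShiftedLo = (uint16_t)(R << AmtLo);            // RLo needs no mask
    ShiftedHi = (uint16_t)((R & 0xff00) << AmtHi); // mask RHi first
    break;
  case SRL:
    ShiftedLo = (uint16_t)((R & 0x00ff) >> AmtLo); // mask RLo first
    ShiftedHi = (uint16_t)(R >> AmtHi);            // RHi needs no mask
    break;
  case SRA: {
    // VSHLI 8 + VSRAI 8: sign-extend the even byte across the word.
    int16_t RLo = (int16_t)(uint16_t)(R << 8) >> 8;
    ShiftedLo = (uint16_t)(RLo >> AmtLo);
    ShiftedHi = (uint16_t)((int16_t)R >> AmtHi);
    break;
  }
  }
  // The 0x5555... select mask: even byte from ShiftedLo, odd from ShiftedHi.
  return (uint16_t)((ShiftedLo & 0x00ff) | (ShiftedHi & 0xff00));
}

int main() {
  for (unsigned Lo = 0; Lo < 256; ++Lo)       // exhaustive over the even lane
    for (unsigned ALo = 0; ALo < 8; ++ALo)    // i8 shifts by >= 8 are poison
      for (unsigned AHi = 0; AHi < 8; ++AHi) {
        uint16_t R = (uint16_t)(0xA500 | Lo);        // odd lane held at 0xA5
        uint16_t Amt = (uint16_t)((AHi << 8) | ALo); // independent amounts
        assert(shiftPair(R, Amt, SHL) ==
               (uint16_t)(((uint8_t)(0xA5 << AHi) << 8) |
                          (uint8_t)(Lo << ALo)));
        assert(shiftPair(R, Amt, SRL) ==
               (uint16_t)(((0xA5 >> AHi) << 8) | (Lo >> ALo)));
        assert(shiftPair(R, Amt, SRA) ==
               (uint16_t)(((uint8_t)((int8_t)0xA5 >> AHi) << 8) |
                          (uint8_t)((int8_t)Lo >> ALo)));
      }
  return 0;
}

The masking mirrors the switch above: SHL only masks RHi, SRL only masks RLo, and SRA replaces RLo's mask with the shift-left-8/arithmetic-shift-right-8 sign extension.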
diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll
index feac3dcad243a..30f1874c51fed 100644
--- a/llvm/test/CodeGen/X86/gfni-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-shifts.ll
@@ -1684,15 +1684,14 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; GFNIAVX512BW-LABEL: var_shl_v64i8:
 ; GFNIAVX512BW:       # %bb.0:
-; GFNIAVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; GFNIAVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2
+; GFNIAVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm2
+; GFNIAVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; GFNIAVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; GFNIAVX512BW-NEXT:    kmovq %rax, %k1
+; GFNIAVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
 ; GFNIAVX512BW-NEXT:    retq
   %shift = shl <64 x i8> %a, %b
   ret <64 x i8> %shift
@@ -1876,15 +1875,15 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; GFNIAVX512BW-LABEL: var_lshr_v64i8:
 ; GFNIAVX512BW:       # %bb.0:
-; GFNIAVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
+; GFNIAVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; GFNIAVX512BW-NEXT:    vpandq %zmm2, %zmm1, %zmm3
+; GFNIAVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm2
+; GFNIAVX512BW-NEXT:    vpsrlvw %zmm3, %zmm2, %zmm2
+; GFNIAVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; GFNIAVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; GFNIAVX512BW-NEXT:    kmovq %rax, %k1
+; GFNIAVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
 ; GFNIAVX512BW-NEXT:    retq
   %shift = lshr <64 x i8> %a, %b
   ret <64 x i8> %shift
@@ -2232,36 +2231,16 @@ define <64 x i8> @var_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; GFNIAVX512BW-LABEL: var_ashr_v64i8:
 ; GFNIAVX512BW:       # %bb.0:
-; GFNIAVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; GFNIAVX512BW-NEXT:    vpsraw $4, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; GFNIAVX512BW-NEXT:    vpmovb2m %zmm4, %k1
-; GFNIAVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
-; GFNIAVX512BW-NEXT:    vpsraw $2, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT:    vpaddw %zmm4, %zmm4, %zmm5
-; GFNIAVX512BW-NEXT:    vpmovb2m %zmm5, %k1
-; GFNIAVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
-; GFNIAVX512BW-NEXT:    vpsraw $1, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT:    vpsllw $2, %zmm4, %zmm4
-; GFNIAVX512BW-NEXT:    vpmovb2m %zmm4, %k1
-; GFNIAVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
-; GFNIAVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
-; GFNIAVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; GFNIAVX512BW-NEXT:    vpsraw $4, %zmm0, %zmm3
-; GFNIAVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; GFNIAVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT:    vpsraw $2, %zmm0, %zmm3
-; GFNIAVX512BW-NEXT:    vpaddw %zmm1, %zmm1, %zmm4
-; GFNIAVX512BW-NEXT:    vpmovb2m %zmm4, %k1
-; GFNIAVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT:    vpsraw $1, %zmm0, %zmm3
-; GFNIAVX512BW-NEXT:    vpsllw $2, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm2
+; GFNIAVX512BW-NEXT:    vpsravw %zmm2, %zmm0, %zmm2
+; GFNIAVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; GFNIAVX512BW-NEXT:    vpsllw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpsraw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; GFNIAVX512BW-NEXT:    kmovq %rax, %k1
+; GFNIAVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm2 {%k1}
+; GFNIAVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; GFNIAVX512BW-NEXT:    retq
   %shift = ashr <64 x i8> %a, %b
   ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
index 0fb0420bb2609..aff2228c258b5 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -106,36 +106,16 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; AVX512BW-LABEL: var_shift_v64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT:    vpsraw $4, %zmm2, %zmm3
-; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpsraw $2, %zmm2, %zmm3
-; AVX512BW-NEXT:    vpaddw %zmm4, %zmm4, %zmm5
-; AVX512BW-NEXT:    vpmovb2m %zmm5, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpsraw $1, %zmm2, %zmm3
-; AVX512BW-NEXT:    vpsllw $2, %zmm4, %zmm4
-; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT:    vpsraw $4, %zmm0, %zmm3
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpsraw $2, %zmm0, %zmm3
-; AVX512BW-NEXT:    vpaddw %zmm1, %zmm1, %zmm4
-; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpsraw $1, %zmm0, %zmm3
-; AVX512BW-NEXT:    vpsllw $2, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm2
+; AVX512BW-NEXT:    vpsravw %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsllw $8, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsraw $8, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT:    kmovq %rax, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
   %shift = ashr <64 x i8> %a, %b
   ret <64 x i8> %shift
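Reviewer note: the vpsllw $8 / vpsraw $8 pair in the new ashr sequences is the "2 immediate shifts" sign extension described in the lowering comment: it replaces each word with its even (low) byte sign-extended to 16 bits, which is exactly what vpsravw needs for the low lane. A standalone one-lane sketch, not part of the patch, with illustrative names:

#include <cassert>
#include <cstdint>

// Models vpsllw $8 followed by vpsraw $8 on one 16-bit lane: the result is
// the word's even (low) byte, sign-extended to 16 bits.
static uint16_t signExtendEvenByte(uint16_t W) {
  uint16_t Shl = (uint16_t)(W << 8);    // vpsllw $8: even byte to the top
  return (uint16_t)((int16_t)Shl >> 8); // vpsraw $8: arithmetic shift back
}

int main() {
  assert(signExtendEvenByte(0x128C) == 0xFF8C); // 0x8C is negative as an i8
  assert(signExtendEvenByte(0x127C) == 0x007C); // 0x7C is positive as an i8
  return 0;
}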
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
index 103d5702fb93a..4450d07e01cca 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -85,20 +85,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; AVX512BW-LABEL: var_shift_v64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpsrlw $2, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpsrlvw %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT:    kmovq %rax, %k1
 ; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
   %shift = lshr <64 x i8> %a, %b
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
index efd742956ed09..41238acc4b74d 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -82,19 +82,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; AVX512BW-LABEL: var_shift_v64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpsllw $2, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2
+; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT:    kmovq %rax, %k1
 ; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
   %shift = shl <64 x i8> %a, %b
   ret <64 x i8> %shift
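Reviewer note: one constant appears in every rewritten sequence: movabsq $6148914691236517205 is 0x5555555555555555, the v64i1 select mask built in the lowering. Bit i of %k1 selects byte i from the vmovdqu8 source, so a mask with every even bit set takes the even (ShiftedLo) bytes and keeps the odd (ShiftedHi) bytes in place. A standalone check of the constant, not part of the patch:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t SelectMask = 6148914691236517205ULL; // the movabsq immediate
  assert(SelectMask == 0x5555555555555555ULL);        // matches the lowering
  for (unsigned i = 0; i < 64; ++i)                   // exactly the even bits
    assert(((SelectMask >> i) & 1) == (i % 2 == 0));
  return 0;
}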