Merged
57 changes: 57 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30908,6 +30908,63 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
}

if (VT == MVT::v64i8 && Subtarget.canExtendTo512BW()) {
// On AVX512BW, we can use variable 16-bit shifts to implement variable
// 8-bit shifts. For this, we split the input into two vectors, RLo and RHi.
// The i-th lane of RLo contains the (2*i)-th lane of R, and the i-th lane
// of RHi contains the (2*i+1)-th lane of R. After shifting, these vectors
// can efficiently be merged together using a masked move.
MVT ExtVT = MVT::v32i16;

SDValue RLo, RHi;
// Isolate lower and upper lanes of Amt by masking odd lanes in AmtLo and
// right shifting AmtHi.
SDValue AmtLo = DAG.getNode(ISD::AND, dl, ExtVT, DAG.getBitcast(ExtVT, Amt),
DAG.getConstant(0x00ff, dl, ExtVT));
SDValue AmtHi = getTargetVShiftByConstNode(
X86ISD::VSRLI, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), 8, DAG);
switch (Opc) {
case ISD::SHL:
// Because we shift left, no bits from the high half can influence the low
// half, so we don't need to mask RLo. We do however need to mask RHi, to
// prevent high bits of an even lane overflowing into low bits of an odd
// lane.
RLo = DAG.getBitcast(ExtVT, R);
RHi = DAG.getNode(ISD::AND, dl, ExtVT, RLo,
DAG.getConstant(0xff00, dl, ExtVT));
break;
case ISD::SRL:
// Same idea as above, but this time we need to make sure no low bits of
// an odd lane can overflow into high bits of an even lane.
RHi = DAG.getBitcast(ExtVT, R);
RLo = DAG.getNode(ISD::AND, dl, ExtVT, RHi,
DAG.getConstant(0x00ff, dl, ExtVT));
break;
case ISD::SRA:
// For arithmetic right shifts, we want to sign extend each even lane of R
// such that the upper half of the corresponding lane of RLo is 0 or -1
// depending on the sign bit of the original lane. We do this using 2
// immediate shifts.
RHi = DAG.getBitcast(ExtVT, R);
RLo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, RHi, 8, DAG);
RLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExtVT, RLo, 8, DAG);
break;
default:
llvm_unreachable("Unexpected Shift Op");
}

SDValue ShiftedLo =
DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RLo, AmtLo));
SDValue ShiftedHi =
DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RHi, AmtHi));

// To merge the shifted vectors back together, we select even lanes
// from ShiftedLo and odd lanes from ShiftedHi.
SDValue SelectMask = DAG.getBitcast(
MVT::v64i1, DAG.getConstant(0x5555555555555555, dl, MVT::i64));
return DAG.getSelect(dl, VT, SelectMask, ShiftedLo, ShiftedHi);
}

if (VT == MVT::v16i8 ||
(VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
(VT == MVT::v64i8 && Subtarget.hasBWI())) {
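For readers less used to SelectionDAG node construction, here is a rough AVX512BW intrinsics sketch of the same even/odd-lane trick (illustrative only: the helper names, the use of intrinsics, and the standalone structure are mine, not part of the patch). It mirrors what the new lowering emits for each of the three shift kinds, including the per-opcode masking discussed in the comments above.

```cpp
// Sketch of the v64i8 variable-shift idea using AVX512BW intrinsics.
// Assumes <immintrin.h> and a compiler flag such as -mavx512bw; not the
// actual lowering, which builds the equivalent SelectionDAG nodes instead.
#include <immintrin.h>

// Even-lane amounts stay in the low byte of each 16-bit word (AmtLo);
// odd-lane amounts are moved down into the low byte (AmtHi).
static inline void split_amounts(__m512i Amt, __m512i *AmtLo, __m512i *AmtHi) {
  *AmtLo = _mm512_and_si512(Amt, _mm512_set1_epi16(0x00ff));
  *AmtHi = _mm512_srli_epi16(Amt, 8);
}

// Merge: even byte lanes from Lo, odd byte lanes from Hi. The 0x5555... mask
// selects the even lanes, matching the final vmovdqu8 {%k1} in the new code.
static inline __m512i merge_even_odd(__m512i Lo, __m512i Hi) {
  return _mm512_mask_mov_epi8(Hi, 0x5555555555555555ULL, Lo);
}

static inline __m512i shl_v64i8(__m512i R, __m512i Amt) {
  __m512i AmtLo, AmtHi;
  split_amounts(Amt, &AmtLo, &AmtHi);
  // Left shift: only the odd half needs masking, so that high bits of an
  // even lane cannot spill into the low bits of the odd lane above it.
  __m512i RHi = _mm512_and_si512(R, _mm512_set1_epi16((short)0xff00));
  return merge_even_odd(_mm512_sllv_epi16(R, AmtLo),
                        _mm512_sllv_epi16(RHi, AmtHi));
}

static inline __m512i lshr_v64i8(__m512i R, __m512i Amt) {
  __m512i AmtLo, AmtHi;
  split_amounts(Amt, &AmtLo, &AmtHi);
  // Right shift: only the even half needs masking, so that low bits of an
  // odd lane cannot spill into the high bits of the even lane below it.
  __m512i RLo = _mm512_and_si512(R, _mm512_set1_epi16(0x00ff));
  return merge_even_odd(_mm512_srlv_epi16(RLo, AmtLo),
                        _mm512_srlv_epi16(R, AmtHi));
}

static inline __m512i ashr_v64i8(__m512i R, __m512i Amt) {
  __m512i AmtLo, AmtHi;
  split_amounts(Amt, &AmtLo, &AmtHi);
  // Sign-extend each even byte across its 16-bit word (shl 8, then sra 8),
  // so the arithmetic shift sees the correct sign bits; the odd half already
  // carries its own sign bit at the top of the word.
  __m512i RLo = _mm512_srai_epi16(_mm512_slli_epi16(R, 8), 8);
  return merge_even_odd(_mm512_srav_epi16(RLo, AmtLo),
                        _mm512_srav_epi16(R, AmtHi));
}
```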
75 changes: 27 additions & 48 deletions llvm/test/CodeGen/X86/gfni-shifts.ll
@@ -1684,15 +1684,14 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; GFNIAVX512BW-LABEL: var_shl_v64i8:
; GFNIAVX512BW: # %bb.0:
; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
; GFNIAVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2
; GFNIAVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2
; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; GFNIAVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
; GFNIAVX512BW-NEXT: kmovq %rax, %k1
; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; GFNIAVX512BW-NEXT: retq
%shift = shl <64 x i8> %a, %b
ret <64 x i8> %shift
@@ -1876,15 +1875,15 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; GFNIAVX512BW-LABEL: var_lshr_v64i8:
; GFNIAVX512BW: # %bb.0:
; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2
; GFNIAVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2
; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; GFNIAVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
; GFNIAVX512BW-NEXT: kmovq %rax, %k1
; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; GFNIAVX512BW-NEXT: retq
%shift = lshr <64 x i8> %a, %b
ret <64 x i8> %shift
@@ -2232,36 +2231,16 @@ define <64 x i8> @var_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; GFNIAVX512BW-LABEL: var_ashr_v64i8:
; GFNIAVX512BW: # %bb.0:
; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; GFNIAVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1
; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
; GFNIAVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3
; GFNIAVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5
; GFNIAVX512BW-NEXT: vpmovb2m %zmm5, %k1
; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
; GFNIAVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3
; GFNIAVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4
; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1
; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; GFNIAVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3
; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
; GFNIAVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3
; GFNIAVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4
; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1
; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
; GFNIAVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3
; GFNIAVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1
; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2
; GFNIAVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2
; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
; GFNIAVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0
; GFNIAVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
; GFNIAVX512BW-NEXT: kmovq %rax, %k1
; GFNIAVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
; GFNIAVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; GFNIAVX512BW-NEXT: retq
%shift = ashr <64 x i8> %a, %b
ret <64 x i8> %shift
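The FileCheck lines above (and in the files below) only pin down the emitted instruction sequence. As a quick arithmetic sanity check of the even/odd-lane trick itself, a small host-side harness along these lines, reusing the hypothetical shl_v64i8 sketch shown earlier, can compare the vector result against plain scalar shifts. Shift amounts are kept in 0..7, since larger per-byte amounts are undefined for the IR shifts these tests exercise.

```cpp
// Host-side cross-check for the shl_v64i8 sketch above; assumes AVX512BW
// hardware and the earlier hypothetical helper definitions. Not part of
// the patch or its test suite.
#include <immintrin.h>
#include <cassert>
#include <cstdint>
#include <cstdlib>

int main() {
  uint8_t a[64], b[64], out[64];
  for (int i = 0; i < 64; ++i) {
    a[i] = (uint8_t)rand();
    b[i] = (uint8_t)(rand() & 7); // keep per-byte shift amounts in range
  }
  __m512i R = _mm512_loadu_si512(a);
  __m512i Amt = _mm512_loadu_si512(b);
  _mm512_storeu_si512(out, shl_v64i8(R, Amt));
  for (int i = 0; i < 64; ++i)
    assert(out[i] == (uint8_t)(a[i] << b[i])); // per-byte scalar reference
  return 0;
}
```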
40 changes: 10 additions & 30 deletions llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -106,36 +106,16 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
; AVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3
; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5
; AVX512BW-NEXT: vpmovb2m %zmm5, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
; AVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3
; AVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3
; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3
; AVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2
; AVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
; AVX512BW-NEXT: kmovq %rax, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
%shift = ashr <64 x i8> %a, %b
ret <64 x i8> %shift
22 changes: 8 additions & 14 deletions llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -85,20 +85,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2
; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
; AVX512BW-NEXT: kmovq %rax, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%shift = lshr <64 x i8> %a, %b
19 changes: 7 additions & 12 deletions llvm/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -82,19 +82,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2
; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
; AVX512BW-NEXT: kmovq %rax, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%shift = shl <64 x i8> %a, %b
ret <64 x i8> %shift
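To reproduce this codegen outside the test suite, feeding one of the var_*_v64i8 functions above to llc with something like `-mtriple=x86_64-unknown-unknown -mattr=+avx512bw` should show the new vpsllvw/vpsrlvw/vpsravw plus masked vmovdqu8 sequences. That flag set is my guess at a minimal configuration; the actual RUN lines in these tests, and the update_llc_test_checks.py invocation used to regenerate them, may enable additional features such as +gfni.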