Commit 011bf4f

[X86][AVX] lowerShuffleWithVTRUNC - extend to support v16i16/v32i8 binary shuffles.
This requires a few additional SrcVT vs DstVT padding cases in getAVX512TruncNode.
1 parent c98fcba commit 011bf4f
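
As an illustration (a hypothetical sketch, not one of the commit's tests), consider the kind of binary v32i8 shuffle this enables: taking the even bytes of the concatenation of two 256-bit inputs is equivalent to truncating the pair viewed as v32i16, so it can now lower to a single truncation such as VPMOVWB on AVX512BW targets.

; Hypothetical sketch: the even bytes of concat(%a, %b) are the low
; bytes of the pair reinterpreted as <32 x i16>, i.e. a v32i16 -> v32i8
; truncation that lowerShuffleAsVTRUNC can now match for 256-bit types.
define <32 x i8> @even_bytes_v32i8(<32 x i8> %a, <32 x i8> %b) {
  %s = shufflevector <32 x i8> %a, <32 x i8> %b,
       <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
                   i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30,
                   i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46,
                   i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  ret <32 x i8> %s
}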

4 files changed: 79 additions & 279 deletions

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 29 additions & 5 deletions
@@ -11292,29 +11292,42 @@ static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG, bool ZeroUppers) {
   MVT SrcVT = Src.getSimpleValueType();
+  MVT DstSVT = DstVT.getScalarType();
   unsigned NumDstElts = DstVT.getVectorNumElements();
   unsigned NumSrcElts = SrcVT.getVectorNumElements();
+  unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
 
   // Perform a direct ISD::TRUNCATE if possible.
   if (NumSrcElts == NumDstElts)
     return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
 
   if (NumSrcElts > NumDstElts) {
-    MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
+    MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
     return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
   }
 
+  if ((NumSrcElts * DstEltSizeInBits) >= 128) {
+    MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
+    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
+    return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
+                          DstVT.getSizeInBits());
+  }
+
   // Non-VLX targets must truncate from a 512-bit type, so we need to
   // widen, truncate and then possibly extract the original subvector.
   if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
     SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
     return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
   }
 
-  // Fallback to a X86ISD::VTRUNC.
-  // TODO: Handle cases where we go from 512-bit vectors to sub-128-bit vectors.
-  return DAG.getNode(X86ISD::VTRUNC, DL, DstVT, Src);
+  // Fallback to a X86ISD::VTRUNC, padding if necessary.
+  MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
+  SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
+  if (DstVT != TruncVT)
+    Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
+                           DstVT.getSizeInBits());
+  return Trunc;
 }
 
 static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
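
A note on the new fallback, as a worked example under the assumption that DstVT is a padded shuffle type such as v32i8: with a v8i64 source, NumSrcElts * DstEltSizeInBits = 8 * 8 = 64, which is below the 128-bit minimum x86 vector width, so the truncated result cannot be built directly. The node is therefore emitted as a 128-bit X86ISD::VTRUNC (v16i8, since 128 / 8 = 16 elements) and widened back to DstVT's size, zeroing the upper elements when ZeroUppers is set.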
@@ -11413,7 +11426,8 @@ static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
                                     const APInt &Zeroable,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
-  assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Unexpected VTRUNC type");
   if (!Subtarget.hasAVX512())
     return SDValue();
 
@@ -16893,6 +16907,11 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                   Subtarget))
     return V;
 
+  // Try to use lower using a truncation.
+  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
+                                       Subtarget, DAG))
+    return V;
+
   // Try to use shift instructions.
   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
                                           Zeroable, Subtarget, DAG))
@@ -17003,6 +17022,11 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  Subtarget))
     return V;
 
+  // Try to use lower using a truncation.
+  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
+                                       Subtarget, DAG))
+    return V;
+
   // Try to use shift instructions.
   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
                                           Zeroable, Subtarget, DAG))
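
Analogously for v16i16, a hypothetical binary shuffle the new call can catch: the even words of two concatenated 256-bit inputs form a v16i32 -> v16i16 truncation (e.g. VPMOVDW on AVX512).

; Hypothetical sketch: the even words of concat(%a, %b) are the low
; words of the pair reinterpreted as <16 x i32>, now reachable from
; lowerV16I16Shuffle via lowerShuffleAsVTRUNC.
define <16 x i16> @even_words_v16i16(<16 x i16> %a, <16 x i16> %b) {
  %s = shufflevector <16 x i16> %a, <16 x i16> %b,
       <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
                   i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  ret <16 x i16> %s
}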

llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll

Lines changed: 19 additions & 232 deletions
@@ -176,89 +176,12 @@ define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
 }
 
 define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm0
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512VL-NEXT:    vpmovdb %ymm1, %xmm1
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512BW-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa 48(%rdi), %xmm0
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512BWVL-NEXT:    vpmovdb %ymm1, %xmm1
-; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512BWVL-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMI-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqa 48(%rdi), %xmm0
-; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512VBMI-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
-; AVX512VBMI-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512VBMI-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512VBMI-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512VBMI-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512VBMI-NEXT:    vzeroupper
-; AVX512VBMI-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
-; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512VBMIVL-NEXT:    vpermt2b 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT:    vmovdqa %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
+; AVX512-LABEL: shuffle_v64i8_to_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
   store <16 x i8> %strided.vec, <16 x i8>* %S
@@ -280,80 +203,12 @@ define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
 }
 
 define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512F-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT:    vpmovqw %zmm1, %xmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm0
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512VL-NEXT:    vpmovqw %ymm1, %xmm1
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512BW-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512BW-NEXT:    vpmovqw %zmm1, %xmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512BWVL-NEXT:    vpermt2w 32(%rdi), %ymm0, %ymm1
-; AVX512BWVL-NEXT:    vmovdqa %xmm1, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMI-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512VBMI-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512VBMI-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512VBMI-NEXT:    vpmovqw %zmm1, %xmm1
-; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512VBMI-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512VBMI-NEXT:    vzeroupper
-; AVX512VBMI-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
-; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512VBMIVL-NEXT:    vpermt2w 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT:    vmovdqa %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
+; AVX512-LABEL: shuffle_v32i16_to_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT:    vpmovqw %zmm0, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %L
   %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
   store <8 x i16> %strided.vec, <8 x i16>* %S
@@ -375,81 +230,13 @@ define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
 }
 
 define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT:    vpmovqb %zmm1, %xmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT:    vpmovqb %ymm1, %xmm1
-; AVX512VL-NEXT:    vpmovqb %ymm0, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512BW-NEXT:    vpmovqb %zmm1, %xmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512BWVL-NEXT:    vpmovqb %ymm1, %xmm1
-; AVX512BWVL-NEXT:    vpmovqb %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BWVL-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqa 48(%rdi), %xmm0
-; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512VBMI-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
-; AVX512VBMI-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512VBMI-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512VBMI-NEXT:    vpmovqb %zmm1, %xmm1
-; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512VBMI-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512VBMI-NEXT:    vzeroupper
-; AVX512VBMI-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224]
-; AVX512VBMIVL-NEXT:    vpermi2b 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT:    vmovq %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
+; AVX512-LABEL: shuffle_v64i8_to_v8i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
   store <8 x i8> %strided.vec, <8 x i8>* %S

llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll

Lines changed: 7 additions & 13 deletions
@@ -4843,19 +4843,13 @@ define <32 x i8> @shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62(<32 x i8> %a0, <32 x i8> %a1)
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
-; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VLBW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vpsrlw $8, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512VLBW-NEXT:    retq
-;
-; AVX512VLVBMI-LABEL: shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
-; AVX512VLVBMI:       # %bb.0:
-; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
-; AVX512VLVBMI-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0
-; AVX512VLVBMI-NEXT:    retq
+; AVX512VL-LABEL: shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512VL-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
 ; XOPAVX1:       # %bb.0: