@@ -176,89 +176,12 @@ define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
 }
 
 define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VL-NEXT: vpmovdb %ymm1, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BWVL-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BWVL-NEXT: vpmovdb %ymm1, %xmm1
-; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMI-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VBMI-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512VBMI-NEXT: vzeroupper
-; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VBMIVL-NEXT: vpermt2b 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
+; AVX512-LABEL: shuffle_v64i8_to_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
   store <16 x i8> %strided.vec, <16 x i8>* %S
@@ -280,80 +203,12 @@ define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
 }
 
 define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VL-NEXT: vpmovqw %ymm1, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
-; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMI-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512VBMI-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VBMI-NEXT: vpmovqw %zmm1, %xmm1
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512VBMI-NEXT: vzeroupper
-; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
+; AVX512-LABEL: shuffle_v32i16_to_v8i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
   %vec = load <32 x i16>, <32 x i16>* %L
   %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
   store <8 x i16> %strided.vec, <8 x i16>* %S
@@ -375,81 +230,13 @@ define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
 }
 
 define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT: vpmovqb %zmm1, %xmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT: vpmovqb %ymm1, %xmm1
-; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512BWVL-NEXT: vpmovqb %ymm1, %xmm1
-; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VBMI-NEXT: vpmovqb %zmm1, %xmm1
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512VBMI-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VBMI-NEXT: vzeroupper
-; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224]
-; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
+; AVX512-LABEL: shuffle_v64i8_to_v8i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512-NEXT: vmovq %xmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
   store <8 x i8> %strided.vec, <8 x i8>* %S