; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=ALL,SLOW,AVX512F
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq | FileCheck %s --check-prefixes=ALL,SLOW,AVX512BW
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,FAST,AVX512F
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,FAST,AVX512BW
target triple = "x86_64-unknown-unknown"
@@ -14,21 +16,33 @@ define <16 x float> @shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
}
; Splat of element 8: SLOW targets extract the subvector then broadcast;
; FAST variable-shuffle targets use a single vpermps with a constant index.
define <16 x float > @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08 (<16 x float > %a , <16 x float > %b ) {
; SLOW-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
; SLOW: # %bb.0:
; SLOW-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
; SLOW-NEXT: retq
;
; FAST-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
; FAST: # %bb.0:
; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: retq
%shuffle = shufflevector <16 x float > %a , <16 x float > %b , <16 x i32 ><i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 >
ret <16 x float > %shuffle
}
define <16 x float > @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc (<16 x i32 > %a , <16 x i32 > %b ) {
; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
; ALL: # %bb.0:
; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
; ALL-NEXT: retq
; SLOW-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
; SLOW: # %bb.0:
; SLOW-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
; SLOW-NEXT: retq
;
; FAST-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
; FAST: # %bb.0:
; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: retq
%tmp0 = bitcast <16 x i32 > %a to <16 x float >
%tmp1 = bitcast <16 x i32 > %b to <16 x float >
%shuffle = shufflevector <16 x float > %tmp0 , <16 x float > %tmp1 , <16 x i32 ><i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 >
Expand Down
Expand Up
@@ -196,11 +210,20 @@ define <16 x float> @shuffle_f32_v16f32_00_08_01_09_02_10_03_11_04_12_05_13_06_1
; PR86076
define <16 x float > @shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08 (float %a0 , float %a1 ) {
; ALL-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
; ALL: # %bb.0:
; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; ALL-NEXT: vbroadcastsd %xmm0, %zmm0
; ALL-NEXT: retq
; SLOW-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
; SLOW: # %bb.0:
; SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; SLOW-NEXT: vbroadcastsd %xmm0, %zmm0
; SLOW-NEXT: retq
;
; FAST-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
; FAST: # %bb.0:
; FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16]
; FAST-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
; FAST-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; FAST-NEXT: retq
%v0 = insertelement <8 x float > poison, float %a0 , i64 0
%v1 = insertelement <8 x float > poison, float %a1 , i64 0
%sv = shufflevector <8 x float > %v0 , <8 x float > %v1 , <16 x i32 > <i32 0 , i32 8 , i32 0 , i32 8 , i32 0 , i32 8 , i32 0 , i32 8 , i32 0 , i32 8 , i32 0 , i32 8 , i32 0 , i32 8 , i32 0 , i32 8 >
Expand All
@@ -217,11 +240,17 @@ define <16 x i32> @shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
}
; Splat of element 4: SLOW targets extract the upper xmm then broadcast;
; FAST variable-shuffle targets use a single vpermps with a constant index.
define <16 x i32 > @shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04 (<16 x i32 > %a , <16 x i32 > %b ) {
; SLOW-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
; SLOW: # %bb.0:
; SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
; SLOW-NEXT: retq
;
; FAST-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
; FAST: # %bb.0:
; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: retq
%shuffle = shufflevector <16 x i32 > %a , <16 x i32 > %b , <16 x i32 ><i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 >
ret <16 x i32 > %shuffle
}
Expand Down
Expand Up
@@ -302,21 +331,33 @@ define <16 x float> @shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08
; PR46249
; PR46249: SLOW targets use shuffle+lane-permute; FAST variable-shuffle
; targets fold into a single vpermps with a constant index vector.
define <16 x i32 > @shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04 (<16 x i32 > %a ) {
; SLOW-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
; SLOW: # %bb.0:
; SLOW-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
; SLOW-NEXT: retq
;
; FAST-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
; FAST: # %bb.0:
; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: retq
%1 = shufflevector <16 x i32 > %a , <16 x i32 > undef , <16 x i32 > <i32 11 , i32 10 , i32 9 , i32 8 , i32 15 , i32 14 , i32 13 , i32 12 , i32 3 , i32 2 , i32 1 , i32 0 , i32 7 , i32 6 , i32 5 , i32 4 >
ret <16 x i32 > %1
}
; Float variant of the PR46249 pattern: SLOW targets use vshufps+lane-permute;
; FAST variable-shuffle targets use a single vpermps.
define <16 x float > @shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04 (<16 x float > %a ) {
; SLOW-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
; SLOW: # %bb.0:
; SLOW-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SLOW-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
; SLOW-NEXT: retq
;
; FAST-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
; FAST: # %bb.0:
; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: retq
%1 = shufflevector <16 x float > %a , <16 x float > undef , <16 x i32 > <i32 11 , i32 10 , i32 9 , i32 8 , i32 15 , i32 14 , i32 13 , i32 12 , i32 3 , i32 2 , i32 1 , i32 0 , i32 7 , i32 6 , i32 5 , i32 4 >
ret <16 x float > %1
}
Expand All
@@ -333,11 +374,17 @@ define <16 x float> @shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_
}
define <16 x float > @shuffle_v16f32_load_08_11_10_00_12_15_14_04 (<16 x float > %a0 , ptr %a1 ) {
; ALL-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
; ALL: # %bb.0:
; ALL-NEXT: vshufps {{.*#+}} zmm1 = zmm0[2,0],mem[0,0],zmm0[6,4],mem[4,4],zmm0[10,8],mem[8,8],zmm0[14,12],mem[12,12]
; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,2],zmm0[4,7],zmm1[4,6],zmm0[8,11],zmm1[8,10],zmm0[12,15],zmm1[12,14]
; ALL-NEXT: retq
; SLOW-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
; SLOW: # %bb.0:
; SLOW-NEXT: vshufps {{.*#+}} zmm1 = zmm0[2,0],mem[0,0],zmm0[6,4],mem[4,4],zmm0[10,8],mem[8,8],zmm0[14,12],mem[12,12]
; SLOW-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,2],zmm0[4,7],zmm1[4,6],zmm0[8,11],zmm1[8,10],zmm0[12,15],zmm1[12,14]
; SLOW-NEXT: retq
;
; FAST-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
; FAST: # %bb.0:
; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [0,3,2,16,4,7,6,20,8,11,10,24,12,15,14,28]
; FAST-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0
; FAST-NEXT: retq
%1 = load <16 x float >, ptr %a1
%2 = shufflevector <16 x float > %1 , <16 x float > %a0 , <16 x i32 > <i32 16 , i32 19 , i32 18 , i32 0 , i32 20 , i32 23 , i32 22 , i32 4 , i32 24 , i32 27 , i32 26 , i32 8 , i32 28 , i32 31 , i32 30 , i32 12 >
ret <16 x float > %2
Expand Down
Expand Up
@@ -365,26 +412,41 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a
;FIXME: can do better with vpcompress
; Extract odd elements: SLOW targets use extract+shuffle+permute; FAST
; variable-shuffle targets use a single vpermps on the full zmm.
define <8 x i32 > @test_v16i32_1_3_5_7_9_11_13_15 (<16 x i32 > %v ) {
; SLOW-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
; SLOW: # %bb.0:
; SLOW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; SLOW-NEXT: retq
;
; FAST-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
; FAST: # %bb.0:
; FAST-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; FAST-NEXT: retq
%res = shufflevector <16 x i32 > %v , <16 x i32 > undef , <8 x i32 > <i32 1 , i32 3 , i32 5 , i32 7 , i32 9 , i32 11 , i32 13 , i32 15 >
ret <8 x i32 > %res
}
;FIXME: can do better with vpcompress
;FIXME: can do better with vpcompress
; Narrowing shuffle: SLOW targets extract/broadcast/blend; FAST
; variable-shuffle targets use a single vpermps on the full zmm.
define <4 x i32 > @test_v16i32_0_1_2_12 (<16 x i32 > %v ) {
; SLOW-LABEL: test_v16i32_0_1_2_12:
; SLOW: # %bb.0:
; SLOW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
; SLOW-NEXT: vbroadcastss %xmm1, %xmm1
; SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; SLOW-NEXT: vzeroupper
; SLOW-NEXT: retq
;
; FAST-LABEL: test_v16i32_0_1_2_12:
; FAST: # %bb.0:
; FAST-NEXT: vmovaps {{.*#+}} xmm1 = [0,1,2,12]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; FAST-NEXT: vzeroupper
; FAST-NEXT: retq
%res = shufflevector <16 x i32 > %v , <16 x i32 > undef , <4 x i32 > <i32 0 , i32 1 , i32 2 , i32 12 >
ret <4 x i32 > %res
}
Expand Down
Expand Up
@@ -568,11 +630,18 @@ define <16 x i32> @shuffle_v8i32_17_16_01_00_21_20_05_04_25_24_09_08_29_28_13_12
}
; Widening splat of v8f32 element 4: SLOW targets extract then broadcast;
; FAST variable-shuffle targets implicitly widen and use vpermps.
define <16 x float > @shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04 (<8 x float > %a ) {
; SLOW-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
; SLOW: # %bb.0:
; SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
; SLOW-NEXT: retq
;
; FAST-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
; FAST: # %bb.0:
; FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: retq
%shuffle = shufflevector <8 x float > %a , <8 x float > undef , <16 x i32 > <i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 >
ret <16 x float > %shuffle
}
Expand Down