Skip to content

Commit

Permalink
[X86] vector-shuffle-512-v16.ll - add fast shuffle test coverage
Browse files Browse the repository at this point in the history
  • Loading branch information
RKSimon committed May 29, 2024
1 parent 9c42ed1 commit f42de69
Showing 1 changed file with 125 additions and 56 deletions.
181 changes: 125 additions & 56 deletions llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Each subtarget is exercised twice: once with default tuning (checked under the
; SLOW prefix) and once with the fast-variable-crosslane-shuffle and
; fast-variable-perlane-shuffle tuning flags enabled (checked under the FAST prefix).
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=ALL,SLOW,AVX512F
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq | FileCheck %s --check-prefixes=ALL,SLOW,AVX512BW
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,FAST,AVX512F
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,FAST,AVX512BW

target triple = "x86_64-unknown-unknown"

Expand All @@ -14,21 +16,33 @@ define <16 x float> @shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
}

; Splat element 8 of %a (the first float of 128-bit lane 2) into all 16 lanes;
; %b is never referenced by the mask.
; Default tuning is expected to extract the lane and broadcast from it, while
; the fast-variable-shuffle tuning is expected to use one vpermps with a
; splatted index vector.
define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<16 x float> %a, <16 x float> %b) {
; SLOW-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
; SLOW: # %bb.0:
; SLOW-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
; SLOW-NEXT: retq
;
; FAST-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
; FAST: # %bb.0:
; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: retq
  %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  ret <16 x float> %shuffle
}

define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
; ALL: # %bb.0:
; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
; ALL-NEXT: retq
; SLOW-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
; SLOW: # %bb.0:
; SLOW-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
; SLOW-NEXT: retq
;
; FAST-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
; FAST: # %bb.0:
; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: retq
%tmp0 = bitcast <16 x i32> %a to <16 x float>
%tmp1 = bitcast <16 x i32> %b to <16 x float>
%shuffle = shufflevector <16 x float> %tmp0, <16 x float> %tmp1, <16 x i32><i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
Expand Down Expand Up @@ -196,11 +210,20 @@ define <16 x float> @shuffle_f32_v16f32_00_08_01_09_02_10_03_11_04_12_05_13_06_1

; PR86076
define <16 x float> @shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08(float %a0, float %a1) {
; ALL-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
; ALL: # %bb.0:
; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; ALL-NEXT: vbroadcastsd %xmm0, %zmm0
; ALL-NEXT: retq
; SLOW-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
; SLOW: # %bb.0:
; SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; SLOW-NEXT: vbroadcastsd %xmm0, %zmm0
; SLOW-NEXT: retq
;
; FAST-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
; FAST: # %bb.0:
; FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16]
; FAST-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
; FAST-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; FAST-NEXT: retq
%v0 = insertelement <8 x float> poison, float %a0, i64 0
%v1 = insertelement <8 x float> poison, float %a1, i64 0
%sv = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
Expand All @@ -217,11 +240,17 @@ define <16 x i32> @shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
}

; Splat element 4 of %a (the first i32 of 128-bit lane 1) into all 16 lanes;
; %b is never referenced by the mask.
; Default tuning is expected to extract the upper half of the low ymm and
; broadcast from it, while the fast-variable-shuffle tuning is expected to use
; one vpermps with a splatted index vector.
define <16 x i32> @shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<16 x i32> %a, <16 x i32> %b) {
; SLOW-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
; SLOW: # %bb.0:
; SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
; SLOW-NEXT: retq
;
; FAST-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
; FAST: # %bb.0:
; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: retq
  %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  ret <16 x i32> %shuffle
}
Expand Down Expand Up @@ -302,21 +331,33 @@ define <16 x float> @shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08

; PR46249
; Single-input i32 shuffle that reverses the four elements inside every 128-bit
; lane and swaps the two 256-bit halves of %a.
; Default tuning is expected to split this into an in-lane vpshufd followed by a
; vshufi64x2 half swap, while the fast-variable-shuffle tuning is expected to
; use one vpermps with a constant index vector.
define <16 x i32> @shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04(<16 x i32> %a) {
; SLOW-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
; SLOW: # %bb.0:
; SLOW-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
; SLOW-NEXT: retq
;
; FAST-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
; FAST: # %bb.0:
; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: retq
  %1 = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <16 x i32> %1
}

; Float counterpart of the i32 test with the same mask: reverses the four
; elements inside every 128-bit lane and swaps the two 256-bit halves of %a.
; Default tuning is expected to split this into an in-lane vshufps followed by a
; vshuff64x2 half swap, while the fast-variable-shuffle tuning is expected to
; use one vpermps with a constant index vector.
define <16 x float> @shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04(<16 x float> %a) {
; SLOW-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
; SLOW: # %bb.0:
; SLOW-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SLOW-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
; SLOW-NEXT: retq
;
; FAST-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
; FAST: # %bb.0:
; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: retq
  %1 = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <16 x float> %1
}
Expand All @@ -333,11 +374,17 @@ define <16 x float> @shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_
}

define <16 x float> @shuffle_v16f32_load_08_11_10_00_12_15_14_04(<16 x float> %a0, ptr %a1) {
; ALL-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
; ALL: # %bb.0:
; ALL-NEXT: vshufps {{.*#+}} zmm1 = zmm0[2,0],mem[0,0],zmm0[6,4],mem[4,4],zmm0[10,8],mem[8,8],zmm0[14,12],mem[12,12]
; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,2],zmm0[4,7],zmm1[4,6],zmm0[8,11],zmm1[8,10],zmm0[12,15],zmm1[12,14]
; ALL-NEXT: retq
; SLOW-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
; SLOW: # %bb.0:
; SLOW-NEXT: vshufps {{.*#+}} zmm1 = zmm0[2,0],mem[0,0],zmm0[6,4],mem[4,4],zmm0[10,8],mem[8,8],zmm0[14,12],mem[12,12]
; SLOW-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,2],zmm0[4,7],zmm1[4,6],zmm0[8,11],zmm1[8,10],zmm0[12,15],zmm1[12,14]
; SLOW-NEXT: retq
;
; FAST-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
; FAST: # %bb.0:
; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [0,3,2,16,4,7,6,20,8,11,10,24,12,15,14,28]
; FAST-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0
; FAST-NEXT: retq
%1 = load <16 x float>, ptr %a1
%2 = shufflevector <16 x float> %1, <16 x float> %a0, <16 x i32> <i32 16, i32 19, i32 18, i32 0, i32 20, i32 23, i32 22, i32 4, i32 24, i32 27, i32 26, i32 8, i32 28, i32 31, i32 30, i32 12>
ret <16 x float> %2
Expand Down Expand Up @@ -365,26 +412,41 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a

; FIXME: can do better with vpcompress
; Gather the eight odd-indexed elements of a <16 x i32> into an <8 x i32> result.
; Default tuning is expected to split the input into ymm halves and combine
; them with vshufps + vpermpd, while the fast-variable-shuffle tuning is
; expected to use one vpermps whose low ymm holds the result.
define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) {
; SLOW-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
; SLOW: # %bb.0:
; SLOW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; SLOW-NEXT: retq
;
; FAST-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
; FAST: # %bb.0:
; FAST-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; FAST-NEXT: retq
  %res = shufflevector <16 x i32> %v, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <8 x i32> %res
}

; FIXME: can do better with vpcompress
; Build a <4 x i32> from elements 0, 1, 2 and 12 of a <16 x i32>.
; Default tuning is expected to extract the top 128-bit lane, broadcast its
; first element and blend it into position 3, while the fast-variable-shuffle
; tuning is expected to use one vpermps whose low xmm holds the result.
define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
; SLOW-LABEL: test_v16i32_0_1_2_12:
; SLOW: # %bb.0:
; SLOW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
; SLOW-NEXT: vbroadcastss %xmm1, %xmm1
; SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; SLOW-NEXT: vzeroupper
; SLOW-NEXT: retq
;
; FAST-LABEL: test_v16i32_0_1_2_12:
; FAST: # %bb.0:
; FAST-NEXT: vmovaps {{.*#+}} xmm1 = [0,1,2,12]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; FAST-NEXT: vzeroupper
; FAST-NEXT: retq
  %res = shufflevector <16 x i32> %v, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 12>
  ret <4 x i32> %res
}
Expand Down Expand Up @@ -568,11 +630,18 @@ define <16 x i32> @shuffle_v8i32_17_16_01_00_21_20_05_04_25_24_09_08_29_28_13_12
}

; Widening splat: element 4 of an <8 x float> input (the first float of its
; upper 128-bit half) is replicated into all 16 lanes of a <16 x float> result.
; Default tuning is expected to extract the upper half and broadcast from it,
; while the fast-variable-shuffle tuning is expected to use one vpermps with a
; splatted index vector.
define <16 x float> @shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<8 x float> %a) {
; SLOW-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
; SLOW: # %bb.0:
; SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
; SLOW-NEXT: retq
;
; FAST-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
; FAST: # %bb.0:
; FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: retq
  %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  ret <16 x float> %shuffle
}
Expand Down

0 comments on commit f42de69

Please sign in to comment.