17 changes: 12 additions & 5 deletions llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -2524,11 +2524,18 @@ define <8 x i32> @shuffle_v8i32_089abcde(<8 x i32> %a, <8 x i32> %b) {
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i32_089abcde:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: valignd {{.*#+}} ymm1 = ymm1[7,0,1,2,3,4,5,6]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX512VL-NEXT: retq
; AVX512VL-SLOW-LABEL: shuffle_v8i32_089abcde:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: valignd {{.*#+}} ymm1 = ymm1[7,0,1,2,3,4,5,6]
; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i32_089abcde:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,0,1,2,3,4,5,6]
; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
; AVX512VL-FAST-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
ret <8 x i32> %shuffle
}
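The new SLOW/FAST split distinguishes AVX512VL targets by whether variable cross-lane shuffles such as vpermi2d are considered cheap: the SLOW prefix keeps the valignd+vpblendd pair, while the FAST prefix folds the whole shuffle into one vpermi2d through a constant index vector. A minimal sketch of RUN lines that would produce these two prefixes; the -mattr strings and the fast-variable-shuffle feature name are assumptions for illustration, not copied from the test file:

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512VL,AVX512VL-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512VL,AVX512VL-FAST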
45 changes: 38 additions & 7 deletions llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -73,16 +73,16 @@ define <32 x i8> @combine_pshufb_and(<32 x i8> %a0) {
ret <32 x i8> %2
}

define <4 x i64> @combine_permq_pshufb_as_vperm2i128(<4 x i64> %a0) {
; X86-LABEL: combine_permq_pshufb_as_vperm2i128:
define <4 x i64> @combine_permq_pshufb_as_vextracti128(<4 x i64> %a0) {
; X86-LABEL: combine_permq_pshufb_as_vextracti128:
; X86: # %bb.0:
; X86-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; X86-NEXT: vextracti128 $1, %ymm0, %xmm0
; X86-NEXT: vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: combine_permq_pshufb_as_vperm2i128:
; X64-LABEL: combine_permq_pshufb_as_vextracti128:
; X64: # %bb.0:
; X64-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; X64-NEXT: vextracti128 $1, %ymm0, %xmm0
; X64-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: retq
%1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -93,6 +93,26 @@ define <4 x i64> @combine_permq_pshufb_as_vperm2i128(<4 x i64> %a0) {
ret <4 x i64> %5
}
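The permq+pshufb pair in combine_permq_pshufb_as_vextracti128 folds to a single two-source shuffle that keeps the upper two qwords of %a0 and zeroes the rest, which is why a lone vextracti128 (whose xmm write implicitly clears the upper ymm half) is sufficient. A sketch of that combined form, with an illustrative function name that is not part of the test:

define <4 x i64> @sketch_combined_vextracti128(<4 x i64> %a0) {
  ; elements 2 and 3 of %a0 followed by two zero elements taken from the second operand
  %r = shufflevector <4 x i64> %a0, <4 x i64> zeroinitializer, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x i64> %r
}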

define <4 x i64> @combine_permq_pshufb_as_vmovdqa(<4 x i64> %a0) {
; X86-LABEL: combine_permq_pshufb_as_vmovdqa:
; X86: # %bb.0:
; X86-NEXT: vmovdqa %xmm0, %xmm0
; X86-NEXT: vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: combine_permq_pshufb_as_vmovdqa:
; X64: # %bb.0:
; X64-NEXT: vmovdqa %xmm0, %xmm0
; X64-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: retq
%1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%2 = bitcast <4 x i64> %1 to <32 x i8>
%3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
%4 = bitcast <32 x i8> %3 to <4 x i64>
%5 = add <4 x i64> %4, <i64 1, i64 1, i64 3, i64 3>
ret <4 x i64> %5
}

define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) {
; CHECK-LABEL: combine_as_vpermd:
; CHECK: # %bb.0:
@@ -117,15 +137,26 @@ define <8 x float> @combine_as_vpermps(<8 x float> %a0) {
ret <8 x float> %3
}

define <32 x i8> @combine_permq_pshufb_as_vmovaps(<4 x i64> %a0) {
; CHECK-LABEL: combine_permq_pshufb_as_vmovaps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%2 = bitcast <4 x i64> %1 to <32 x i8>
%3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
ret <32 x i8> %3
}
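Both the vmovdqa and vmovaps tests above collapse to the same operation: keep the low 128 bits of %a0 in place and zero the upper half, which a VEX-encoded xmm-to-xmm register move already does. A sketch of the combined shuffle, again with a made-up function name:

define <4 x i64> @sketch_combined_xmm_move(<4 x i64> %a0) {
  ; elements 0 and 1 of %a0, then two zeros from the second operand
  %r = shufflevector <4 x i64> %a0, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i64> %r
}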

define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) {
; CHECK-LABEL: combine_permq_pshufb_as_vpblendd:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT: ret{{[l|q]}}
%1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%2 = bitcast <4 x i64> %1 to <32 x i8>
%3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
%3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
ret <32 x i8> %3
}
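The updated vpblendd checks blend the zero register into the low half rather than the high half because the new pshufb mask clears the lower 16 bytes and keeps the original upper two qwords. A sketch of the combined shuffle this corresponds to (illustrative name only):

define <4 x i64> @sketch_combined_vpblendd(<4 x i64> %a0) {
  ; two zeros from the second operand, then elements 2 and 3 of %a0
  %r = shufflevector <4 x i64> %a0, <4 x i64> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i64> %r
}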

10 changes: 6 additions & 4 deletions llvm/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -62,8 +62,9 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z}
; AVX512VL-NEXT: movq $-1, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm2
; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2]
; AVX512VL-NEXT: vpermi2q %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vptestmq %xmm3, %xmm3, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
;
@@ -74,8 +75,9 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; VL_BW_DQ-NEXT: movq $-1, %rax
; VL_BW_DQ-NEXT: vmovq %rax, %xmm0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm1
; VL_BW_DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2]
; VL_BW_DQ-NEXT: vpermi2q %xmm0, %xmm1, %xmm2
; VL_BW_DQ-NEXT: vpmovq2m %xmm2, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT: retq
%b = shufflevector <2 x i1> %a, <2 x i1> <i1 1, i1 0>, <2 x i32> <i32 1, i32 2>
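In shuf2i1_1_2 the <2 x i1> operands are widened to <2 x i64> sign-mask vectors before lowering, so the mask <1, 2> becomes an ordinary two-input element shuffle; that is exactly what vpermi2q performs with the constant index vector [1,2], where index 1 selects element 1 of the first table register and index 2 selects element 0 of the second. A sketch of the widened form, with made-up value names:

define <2 x i64> @sketch_widened_shuf2i1_1_2(<2 x i64> %a_wide, <2 x i64> %b_wide) {
  ; element 1 of the first input followed by element 0 of the second
  %r = shufflevector <2 x i64> %a_wide, <2 x i64> %b_wide, <2 x i32> <i32 1, i32 2>
  ret <2 x i64> %r
}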