48 changes: 24 additions & 24 deletions llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll
@@ -16,7 +16,7 @@ define <4 x float> @test_masked_4xfloat_perm_mask0(<4 x float> %vec, <4 x float>
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
; CHECK-NEXT: vpermilps {{.*#+}} xmm1 {%k1} = xmm0[2,1,3,1]
; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[2,1,3,1]
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 1>
@@ -30,7 +30,7 @@ define <4 x float> @test_masked_z_4xfloat_perm_mask0(<4 x float> %vec, <4 x floa
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,3,1]
; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,3,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 1>
%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
@@ -42,7 +42,7 @@ define <4 x float> @test_masked_4xfloat_perm_mask1(<4 x float> %vec, <4 x float>
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
; CHECK-NEXT: vpermilps {{.*#+}} xmm1 {%k1} = xmm0[1,2,3,2]
; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[1,2,3,2]
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 2>
@@ -56,7 +56,7 @@ define <4 x float> @test_masked_z_4xfloat_perm_mask1(<4 x float> %vec, <4 x floa
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2,3,2]
; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2,3,2]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 2>
%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
@@ -68,7 +68,7 @@ define <4 x float> @test_masked_4xfloat_perm_mask2(<4 x float> %vec, <4 x float>
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
; CHECK-NEXT: vpermilps {{.*#+}} xmm1 {%k1} = xmm0[1,3,2,1]
; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[1,3,2,1]
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 1>
@@ -82,7 +82,7 @@ define <4 x float> @test_masked_z_4xfloat_perm_mask2(<4 x float> %vec, <4 x floa
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3,2,1]
; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3,2,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 1>
%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
@@ -102,7 +102,7 @@ define <4 x float> @test_masked_4xfloat_perm_mask3(<4 x float> %vec, <4 x float>
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
; CHECK-NEXT: vpermilps {{.*#+}} xmm1 {%k1} = xmm0[1,2,3,2]
; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[1,2,3,2]
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 2>
@@ -116,7 +116,7 @@ define <4 x float> @test_masked_z_4xfloat_perm_mask3(<4 x float> %vec, <4 x floa
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2,3,2]
; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2,3,2]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 2>
%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
@@ -292,7 +292,7 @@ define <8 x float> @test_masked_8xfloat_perm_imm_mask1(<8 x float> %vec, <8 x fl
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
; CHECK-NEXT: vpermilps {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,7,6,7,6]
; CHECK-NEXT: vshufps {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,7,6,7,6]
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 2, i32 7, i32 6, i32 7, i32 6>
@@ -306,7 +306,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_imm_mask1(<8 x float> %vec, <8 x
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,7,6,7,6]
; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,7,6,7,6]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 2, i32 7, i32 6, i32 7, i32 6>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -352,7 +352,7 @@ define <8 x float> @test_masked_8xfloat_perm_imm_mask3(<8 x float> %vec, <8 x fl
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
; CHECK-NEXT: vpermilps {{.*#+}} ymm1 {%k1} = ymm0[2,2,1,0,6,6,5,4]
; CHECK-NEXT: vshufps {{.*#+}} ymm1 {%k1} = ymm0[2,2,1,0,6,6,5,4]
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 2, i32 1, i32 0, i32 6, i32 6, i32 5, i32 4>
@@ -366,7 +366,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_imm_mask3(<8 x float> %vec, <8 x
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,1,0,6,6,5,4]
; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,1,0,6,6,5,4]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 2, i32 1, i32 0, i32 6, i32 6, i32 5, i32 4>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -404,7 +404,7 @@ define <8 x float> @test_masked_8xfloat_perm_imm_mask5(<8 x float> %vec, <8 x fl
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
; CHECK-NEXT: vpermilps {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3,6,5,7,7]
; CHECK-NEXT: vshufps {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3,6,5,7,7]
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 3, i32 6, i32 5, i32 7, i32 7>
@@ -418,7 +418,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_imm_mask5(<8 x float> %vec, <8 x
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3,6,5,7,7]
; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3,6,5,7,7]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 3, i32 6, i32 5, i32 7, i32 7>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -464,7 +464,7 @@ define <8 x float> @test_masked_8xfloat_perm_imm_mask7(<8 x float> %vec, <8 x fl
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
; CHECK-NEXT: vpermilps {{.*#+}} ymm1 {%k1} = ymm0[3,0,2,1,7,4,6,5]
; CHECK-NEXT: vshufps {{.*#+}} ymm1 {%k1} = ymm0[3,0,2,1,7,4,6,5]
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 0, i32 2, i32 1, i32 7, i32 4, i32 6, i32 5>
@@ -478,7 +478,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_imm_mask7(<8 x float> %vec, <8 x
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,2,1,7,4,6,5]
; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,2,1,7,4,6,5]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 0, i32 2, i32 1, i32 7, i32 4, i32 6, i32 5>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -785,7 +785,7 @@ define <16 x float> @test_masked_16xfloat_perm_imm_mask1(<16 x float> %vec, <16
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[2,2,2,1,6,6,6,5,10,10,10,9,14,14,14,13]
; CHECK-NEXT: vshufps {{.*#+}} zmm1 {%k1} = zmm0[2,2,2,1,6,6,6,5,10,10,10,9,14,14,14,13]
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 2, i32 2, i32 2, i32 1, i32 6, i32 6, i32 6, i32 5, i32 10, i32 10, i32 10, i32 9, i32 14, i32 14, i32 14, i32 13>
@@ -799,7 +799,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_imm_mask1(<16 x float> %vec, <1
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,2,2,1,6,6,6,5,10,10,10,9,14,14,14,13]
; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,2,2,1,6,6,6,5,10,10,10,9,14,14,14,13]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 2, i32 2, i32 2, i32 1, i32 6, i32 6, i32 6, i32 5, i32 10, i32 10, i32 10, i32 9, i32 14, i32 14, i32 14, i32 13>
%cmp = fcmp oeq <16 x float> %mask, zeroinitializer
@@ -845,7 +845,7 @@ define <16 x float> @test_masked_16xfloat_perm_imm_mask3(<16 x float> %vec, <16
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[1,1,0,2,5,5,4,6,9,9,8,10,13,13,12,14]
; CHECK-NEXT: vshufps {{.*#+}} zmm1 {%k1} = zmm0[1,1,0,2,5,5,4,6,9,9,8,10,13,13,12,14]
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 0, i32 2, i32 5, i32 5, i32 4, i32 6, i32 9, i32 9, i32 8, i32 10, i32 13, i32 13, i32 12, i32 14>
@@ -859,7 +859,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_imm_mask3(<16 x float> %vec, <1
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,0,2,5,5,4,6,9,9,8,10,13,13,12,14]
; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,0,2,5,5,4,6,9,9,8,10,13,13,12,14]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 0, i32 2, i32 5, i32 5, i32 4, i32 6, i32 9, i32 9, i32 8, i32 10, i32 13, i32 13, i32 12, i32 14>
%cmp = fcmp oeq <16 x float> %mask, zeroinitializer
@@ -897,7 +897,7 @@ define <16 x float> @test_masked_16xfloat_perm_imm_mask5(<16 x float> %vec, <16
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[1,2,1,0,5,6,5,4,9,10,9,8,13,14,13,12]
; CHECK-NEXT: vshufps {{.*#+}} zmm1 {%k1} = zmm0[1,2,1,0,5,6,5,4,9,10,9,8,13,14,13,12]
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 2, i32 1, i32 0, i32 5, i32 6, i32 5, i32 4, i32 9, i32 10, i32 9, i32 8, i32 13, i32 14, i32 13, i32 12>
@@ -911,7 +911,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_imm_mask5(<16 x float> %vec, <1
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,2,1,0,5,6,5,4,9,10,9,8,13,14,13,12]
; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,2,1,0,5,6,5,4,9,10,9,8,13,14,13,12]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 2, i32 1, i32 0, i32 5, i32 6, i32 5, i32 4, i32 9, i32 10, i32 9, i32 8, i32 13, i32 14, i32 13, i32 12>
%cmp = fcmp oeq <16 x float> %mask, zeroinitializer
@@ -957,7 +957,7 @@ define <16 x float> @test_masked_16xfloat_perm_imm_mask7(<16 x float> %vec, <16
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[3,3,0,2,7,7,4,6,11,11,8,10,15,15,12,14]
; CHECK-NEXT: vshufps {{.*#+}} zmm1 {%k1} = zmm0[3,3,0,2,7,7,4,6,11,11,8,10,15,15,12,14]
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 0, i32 2, i32 7, i32 7, i32 4, i32 6, i32 11, i32 11, i32 8, i32 10, i32 15, i32 15, i32 12, i32 14>
@@ -971,7 +971,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_imm_mask7(<16 x float> %vec, <1
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,0,2,7,7,4,6,11,11,8,10,15,15,12,14]
; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,0,2,7,7,4,6,11,11,8,10,15,15,12,14]
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 0, i32 2, i32 7, i32 7, i32 4, i32 6, i32 11, i32 11, i32 8, i32 10, i32 15, i32 15, i32 12, i32 14>
%cmp = fcmp oeq <16 x float> %mask, zeroinitializer
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -903,15 +903,15 @@ define <8 x float>@test_int_x86_avx512_mask_vpermil_ps_256(<8 x float> %x0, <8 x
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpermilps $22, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x04,0xc8,0x16]
; X86-NEXT: vshufps $22, %ymm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0xc6,0xc8,0x16]
; X86-NEXT: # ymm1 {%k1} = ymm0[2,1,1,0,6,5,5,4]
; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask_vpermil_ps_256:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpermilps $22, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x04,0xc8,0x16]
; X64-NEXT: vshufps $22, %ymm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0xc6,0xc8,0x16]
; X64-NEXT: # ymm1 {%k1} = ymm0[2,1,1,0,6,5,5,4]
; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
@@ -924,14 +924,14 @@ define <8 x float>@test_int_x86_avx512_maskz_vpermil_ps_256(<8 x float> %x0, i8
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpermilps $22, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x04,0xc0,0x16]
; X86-NEXT: vshufps $22, %ymm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0xc6,0xc0,0x16]
; X86-NEXT: # ymm0 {%k1} {z} = ymm0[2,1,1,0,6,5,5,4]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_maskz_vpermil_ps_256:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpermilps $22, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x04,0xc0,0x16]
; X64-NEXT: vshufps $22, %ymm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0xc6,0xc0,0x16]
; X64-NEXT: # ymm0 {%k1} {z} = ymm0[2,1,1,0,6,5,5,4]
; X64-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> zeroinitializer, i8 %x3)
@@ -955,15 +955,15 @@ define <4 x float>@test_int_x86_avx512_mask_vpermil_ps_128(<4 x float> %x0, <4 x
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpermilps $22, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x04,0xc8,0x16]
; X86-NEXT: vshufps $22, %xmm0, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0xc6,0xc8,0x16]
; X86-NEXT: # xmm1 {%k1} = xmm0[2,1,1,0]
; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask_vpermil_ps_128:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpermilps $22, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x04,0xc8,0x16]
; X64-NEXT: vshufps $22, %xmm0, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0xc6,0xc8,0x16]
; X64-NEXT: # xmm1 {%k1} = xmm0[2,1,1,0]
; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
@@ -976,14 +976,14 @@ define <4 x float>@test_int_x86_avx512_maskz_vpermil_ps_128(<4 x float> %x0, i8
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpermilps $22, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x04,0xc0,0x16]
; X86-NEXT: vshufps $22, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0xc6,0xc0,0x16]
; X86-NEXT: # xmm0 {%k1} {z} = xmm0[2,1,1,0]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_maskz_vpermil_ps_128:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vpermilps $22, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x04,0xc0,0x16]
; X64-NEXT: vshufps $22, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0xc6,0xc0,0x16]
; X64-NEXT: # xmm0 {%k1} {z} = xmm0[2,1,1,0]
; X64-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> zeroinitializer, i8 %x3)
180 changes: 144 additions & 36 deletions llvm/test/CodeGen/X86/tuning-shuffle-permilps-avx512.ll
@@ -35,7 +35,7 @@ define <16 x float> @transform_VPERMILPSZrrkz(<16 x float> %a, i16 %mask_int) no
; CHECK-LABEL: transform_VPERMILPSZrrkz:
; CHECK: # %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT: retq
%mask = bitcast i16 %mask_int to <16 x i1>
%shufp = shufflevector <16 x float> %a, <16 x float> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
@@ -47,7 +47,7 @@ define <8 x float> @transform_VPERMILPSYrrkz(<8 x float> %a, i8 %mask_int) nounw
; CHECK-LABEL: transform_VPERMILPSYrrkz:
; CHECK: # %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,0,7,6,5,4]
; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,0,7,6,5,4]
; CHECK-NEXT: retq
%mask = bitcast i8 %mask_int to <8 x i1>
%shufp = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
@@ -59,7 +59,7 @@ define <4 x float> @transform_VPERMILPSrrkz(<4 x float> %a, i4 %mask_int) nounwi
; CHECK-LABEL: transform_VPERMILPSrrkz:
; CHECK: # %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[3,2,1,0]
; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[3,2,1,0]
; CHECK-NEXT: retq
%mask = bitcast i4 %mask_int to <4 x i1>
%shufp = shufflevector <4 x float> %a, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -71,7 +71,7 @@ define <16 x float> @transform_VPERMILPSZrrk(<16 x float> %a, <16 x float> %b, i
; CHECK-LABEL: transform_VPERMILPSZrrk:
; CHECK: # %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT: vshufps {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
%mask = bitcast i16 %mask_int to <16 x i1>
@@ -84,7 +84,7 @@ define <8 x float> @transform_VPERMILPSYrrk(<8 x float> %a, <8 x float> %b, i8 %
; CHECK-LABEL: transform_VPERMILPSYrrk:
; CHECK: # %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,0,7,6,5,4]
; CHECK-NEXT: vshufps {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,0,7,6,5,4]
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%mask = bitcast i8 %mask_int to <8 x i1>
@@ -97,7 +97,7 @@ define <4 x float> @transform_VPERMILPSrrk(<4 x float> %a, <4 x float> %b, i4 %m
; CHECK-LABEL: transform_VPERMILPSrrk:
; CHECK: # %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} xmm1 {%k1} = xmm0[3,2,1,0]
; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[3,2,1,0]
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%mask = bitcast i4 %mask_int to <4 x i1>
@@ -182,11 +182,29 @@ define <4 x float> @transform_VPERMILPSrm(ptr %ap) nounwind {
}

define <16 x float> @transform_VPERMILPSZrmkz(ptr %ap, i16 %mask_int) nounwind {
; CHECK-LABEL: transform_VPERMILPSZrmkz:
; CHECK: # %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT: retq
; CHECK-ICX-LABEL: transform_VPERMILPSZrmkz:
; CHECK-ICX: # %bb.0:
; CHECK-ICX-NEXT: kmovd %esi, %k1
; CHECK-ICX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-ICX-NEXT: retq
;
; CHECK-V4-LABEL: transform_VPERMILPSZrmkz:
; CHECK-V4: # %bb.0:
; CHECK-V4-NEXT: kmovd %esi, %k1
; CHECK-V4-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-V4-NEXT: retq
;
; CHECK-AVX512-LABEL: transform_VPERMILPSZrmkz:
; CHECK-AVX512: # %bb.0:
; CHECK-AVX512-NEXT: kmovd %esi, %k1
; CHECK-AVX512-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-AVX512-NEXT: retq
;
; CHECK-ZNVER4-LABEL: transform_VPERMILPSZrmkz:
; CHECK-ZNVER4: # %bb.0:
; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
; CHECK-ZNVER4-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-ZNVER4-NEXT: retq
%mask = bitcast i16 %mask_int to <16 x i1>
%a = load <16 x float>, ptr %ap
%shufp = shufflevector <16 x float> %a, <16 x float> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
@@ -195,11 +213,29 @@ define <16 x float> @transform_VPERMILPSZrmkz(ptr %ap, i16 %mask_int) nounwind {
}

define <8 x float> @transform_VPERMILPSYrmkz(ptr %ap, i8 %mask_int) nounwind {
; CHECK-LABEL: transform_VPERMILPSYrmkz:
; CHECK: # %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4]
; CHECK-NEXT: retq
; CHECK-ICX-LABEL: transform_VPERMILPSYrmkz:
; CHECK-ICX: # %bb.0:
; CHECK-ICX-NEXT: kmovd %esi, %k1
; CHECK-ICX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4]
; CHECK-ICX-NEXT: retq
;
; CHECK-V4-LABEL: transform_VPERMILPSYrmkz:
; CHECK-V4: # %bb.0:
; CHECK-V4-NEXT: kmovd %esi, %k1
; CHECK-V4-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4]
; CHECK-V4-NEXT: retq
;
; CHECK-AVX512-LABEL: transform_VPERMILPSYrmkz:
; CHECK-AVX512: # %bb.0:
; CHECK-AVX512-NEXT: kmovd %esi, %k1
; CHECK-AVX512-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4]
; CHECK-AVX512-NEXT: retq
;
; CHECK-ZNVER4-LABEL: transform_VPERMILPSYrmkz:
; CHECK-ZNVER4: # %bb.0:
; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
; CHECK-ZNVER4-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4]
; CHECK-ZNVER4-NEXT: retq
%mask = bitcast i8 %mask_int to <8 x i1>
%a = load <8 x float>, ptr %ap
%shufp = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
@@ -208,11 +244,29 @@ define <8 x float> @transform_VPERMILPSYrmkz(ptr %ap, i8 %mask_int) nounwind {
}

define <4 x float> @transform_VPERMILPSrmkz(ptr %ap, i4 %mask_int) nounwind {
; CHECK-LABEL: transform_VPERMILPSrmkz:
; CHECK: # %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = mem[3,2,1,0]
; CHECK-NEXT: retq
; CHECK-ICX-LABEL: transform_VPERMILPSrmkz:
; CHECK-ICX: # %bb.0:
; CHECK-ICX-NEXT: kmovd %esi, %k1
; CHECK-ICX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[3,2,1,0]
; CHECK-ICX-NEXT: retq
;
; CHECK-V4-LABEL: transform_VPERMILPSrmkz:
; CHECK-V4: # %bb.0:
; CHECK-V4-NEXT: kmovd %esi, %k1
; CHECK-V4-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = mem[3,2,1,0]
; CHECK-V4-NEXT: retq
;
; CHECK-AVX512-LABEL: transform_VPERMILPSrmkz:
; CHECK-AVX512: # %bb.0:
; CHECK-AVX512-NEXT: kmovd %esi, %k1
; CHECK-AVX512-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = mem[3,2,1,0]
; CHECK-AVX512-NEXT: retq
;
; CHECK-ZNVER4-LABEL: transform_VPERMILPSrmkz:
; CHECK-ZNVER4: # %bb.0:
; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
; CHECK-ZNVER4-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = mem[3,2,1,0]
; CHECK-ZNVER4-NEXT: retq
%mask = bitcast i4 %mask_int to <4 x i1>
%a = load <4 x float>, ptr %ap
%shufp = shufflevector <4 x float> %a, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -221,11 +275,29 @@ define <4 x float> @transform_VPERMILPSrmkz(ptr %ap, i4 %mask_int) nounwind {
}

define <16 x float> @transform_VPERMILPSZrmk(ptr %ap, <16 x float> %b, i16 %mask_int) nounwind {
; CHECK-LABEL: transform_VPERMILPSZrmk:
; CHECK: # %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT: retq
; CHECK-ICX-LABEL: transform_VPERMILPSZrmk:
; CHECK-ICX: # %bb.0:
; CHECK-ICX-NEXT: kmovd %esi, %k1
; CHECK-ICX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-ICX-NEXT: retq
;
; CHECK-V4-LABEL: transform_VPERMILPSZrmk:
; CHECK-V4: # %bb.0:
; CHECK-V4-NEXT: kmovd %esi, %k1
; CHECK-V4-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-V4-NEXT: retq
;
; CHECK-AVX512-LABEL: transform_VPERMILPSZrmk:
; CHECK-AVX512: # %bb.0:
; CHECK-AVX512-NEXT: kmovd %esi, %k1
; CHECK-AVX512-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-AVX512-NEXT: retq
;
; CHECK-ZNVER4-LABEL: transform_VPERMILPSZrmk:
; CHECK-ZNVER4: # %bb.0:
; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
; CHECK-ZNVER4-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-ZNVER4-NEXT: retq
%mask = bitcast i16 %mask_int to <16 x i1>
%a = load <16 x float>, ptr %ap
%shufp = shufflevector <16 x float> %a, <16 x float> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
@@ -234,11 +306,29 @@ define <16 x float> @transform_VPERMILPSZrmk(ptr %ap, <16 x float> %b, i16 %mask
}

define <8 x float> @transform_VPERMILPSYrmk(ptr %ap, <8 x float> %b, i8 %mask_int) nounwind {
; CHECK-LABEL: transform_VPERMILPSYrmk:
; CHECK: # %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} = mem[3,2,1,0,7,6,5,4]
; CHECK-NEXT: retq
; CHECK-ICX-LABEL: transform_VPERMILPSYrmk:
; CHECK-ICX: # %bb.0:
; CHECK-ICX-NEXT: kmovd %esi, %k1
; CHECK-ICX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,1,0,7,6,5,4]
; CHECK-ICX-NEXT: retq
;
; CHECK-V4-LABEL: transform_VPERMILPSYrmk:
; CHECK-V4: # %bb.0:
; CHECK-V4-NEXT: kmovd %esi, %k1
; CHECK-V4-NEXT: vpermilps {{.*#+}} ymm0 {%k1} = mem[3,2,1,0,7,6,5,4]
; CHECK-V4-NEXT: retq
;
; CHECK-AVX512-LABEL: transform_VPERMILPSYrmk:
; CHECK-AVX512: # %bb.0:
; CHECK-AVX512-NEXT: kmovd %esi, %k1
; CHECK-AVX512-NEXT: vpermilps {{.*#+}} ymm0 {%k1} = mem[3,2,1,0,7,6,5,4]
; CHECK-AVX512-NEXT: retq
;
; CHECK-ZNVER4-LABEL: transform_VPERMILPSYrmk:
; CHECK-ZNVER4: # %bb.0:
; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
; CHECK-ZNVER4-NEXT: vpermilps {{.*#+}} ymm0 {%k1} = mem[3,2,1,0,7,6,5,4]
; CHECK-ZNVER4-NEXT: retq
%mask = bitcast i8 %mask_int to <8 x i1>
%a = load <8 x float>, ptr %ap
%shufp = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
@@ -247,11 +337,29 @@ define <8 x float> @transform_VPERMILPSYrmk(ptr %ap, <8 x float> %b, i8 %mask_in
}

define <4 x float> @transform_VPERMILPSrmk(ptr %ap, <4 x float> %b, i4 %mask_int) nounwind {
; CHECK-LABEL: transform_VPERMILPSrmk:
; CHECK: # %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} = mem[3,2,1,0]
; CHECK-NEXT: retq
; CHECK-ICX-LABEL: transform_VPERMILPSrmk:
; CHECK-ICX: # %bb.0:
; CHECK-ICX-NEXT: kmovd %esi, %k1
; CHECK-ICX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[3,2,1,0]
; CHECK-ICX-NEXT: retq
;
; CHECK-V4-LABEL: transform_VPERMILPSrmk:
; CHECK-V4: # %bb.0:
; CHECK-V4-NEXT: kmovd %esi, %k1
; CHECK-V4-NEXT: vpermilps {{.*#+}} xmm0 {%k1} = mem[3,2,1,0]
; CHECK-V4-NEXT: retq
;
; CHECK-AVX512-LABEL: transform_VPERMILPSrmk:
; CHECK-AVX512: # %bb.0:
; CHECK-AVX512-NEXT: kmovd %esi, %k1
; CHECK-AVX512-NEXT: vpermilps {{.*#+}} xmm0 {%k1} = mem[3,2,1,0]
; CHECK-AVX512-NEXT: retq
;
; CHECK-ZNVER4-LABEL: transform_VPERMILPSrmk:
; CHECK-ZNVER4: # %bb.0:
; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
; CHECK-ZNVER4-NEXT: vpermilps {{.*#+}} xmm0 {%k1} = mem[3,2,1,0]
; CHECK-ZNVER4-NEXT: retq
%mask = bitcast i4 %mask_int to <4 x i1>
%a = load <4 x float>, ptr %ap
%shufp = shufflevector <4 x float> %a, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
362 changes: 296 additions & 66 deletions llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll

Large diffs are not rendered by default.

57 changes: 39 additions & 18 deletions llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll
@@ -1,39 +1,60 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake | FileCheck %s --check-prefixes=CHECK,CHECK-SKL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-V3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX2,CHECK-SKL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX2,CHECK-V3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=icelake-server | FileCheck %s --check-prefixes=CHECK,CHECK-ICX

define <8 x float> @transform_VUNPCKLPDYrr(<8 x float> %a, <8 x float> %b) nounwind {
; CHECK-LABEL: transform_VUNPCKLPDYrr:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT: retq
; CHECK-AVX2-LABEL: transform_VUNPCKLPDYrr:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-AVX2-NEXT: retq
;
; CHECK-ICX-LABEL: transform_VUNPCKLPDYrr:
; CHECK-ICX: # %bb.0:
; CHECK-ICX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1],ymm0[4,5],ymm1[4,5]
; CHECK-ICX-NEXT: retq
%shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
ret <8 x float> %shufp
}

define <8 x float> @transform_VUNPCKHPDYrr(<8 x float> %a, <8 x float> %b) nounwind {
; CHECK-LABEL: transform_VUNPCKHPDYrr:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT: retq
; CHECK-AVX2-LABEL: transform_VUNPCKHPDYrr:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-AVX2-NEXT: retq
;
; CHECK-ICX-LABEL: transform_VUNPCKHPDYrr:
; CHECK-ICX: # %bb.0:
; CHECK-ICX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3],ymm0[6,7],ymm1[6,7]
; CHECK-ICX-NEXT: retq
%shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15>
ret <8 x float> %shufp
}

define <4 x float> @transform_VUNPCKLPDrr(<4 x float> %a, <4 x float> %b) nounwind {
; CHECK-LABEL: transform_VUNPCKLPDrr:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
; CHECK-AVX2-LABEL: transform_VUNPCKLPDrr:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-AVX2-NEXT: retq
;
; CHECK-ICX-LABEL: transform_VUNPCKLPDrr:
; CHECK-ICX: # %bb.0:
; CHECK-ICX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
; CHECK-ICX-NEXT: retq
%shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x float> %shufp
}

define <4 x float> @transform_VUNPCKHPDrr(<4 x float> %a, <4 x float> %b) nounwind {
; CHECK-LABEL: transform_VUNPCKHPDrr:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; CHECK-NEXT: retq
; CHECK-AVX2-LABEL: transform_VUNPCKHPDrr:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; CHECK-AVX2-NEXT: retq
;
; CHECK-ICX-LABEL: transform_VUNPCKHPDrr:
; CHECK-ICX: # %bb.0:
; CHECK-ICX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[2,3]
; CHECK-ICX-NEXT: retq
%shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
ret <4 x float> %shufp
}
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
@@ -362,19 +362,19 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask(<16 x float> %x0, <1
; X86-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X86-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X86-NEXT: retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: kmovw %edi, %k1
; X64-AVX512F-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-AVX512F-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: kmovd %edi, %k1
; X64-AVX512BW-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-AVX512BW-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-AVX512BW-NEXT: retq
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m)
ret <16 x float> %res0
@@ -543,19 +543,19 @@ define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask(<16 x float> %x0, <
; X86-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X86-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X86-NEXT: retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: kmovw %edi, %k1
; X64-AVX512F-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-AVX512F-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: kmovd %edi, %k1
; X64-AVX512BW-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-AVX512BW-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-AVX512BW-NEXT: retq
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m)
ret <16 x float> %res0