Skip to content

Commit 84e46aa

Browse files
authored
[X86] combineConcatVectorOps - add handling to concat setcc instructions together (#170295)
So far this only handles AVX512 predicate masks, which are by far the easiest case to support. AVX1/AVX2 support can mostly be dealt with via CMPP + CMPEQ/GT nodes, but these still fail for some icmp expansions where nodes have multiple uses.
1 parent d6f9205 commit 84e46aa

File tree

5 files changed

+149
-162
lines changed

5 files changed

+149
-162
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59427,6 +59427,31 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
5942759427
}
5942859428
}
5942959429
break;
59430+
case ISD::SETCC:
59431+
if (!IsSplat && EltSizeInBits == 1 &&
59432+
llvm::all_of(Ops, [Op0](SDValue Op) {
59433+
return Op0.getOperand(0).getValueType() ==
59434+
Op.getOperand(0).getValueType() &&
59435+
Op0.getOperand(2) == Op.getOperand(2);
59436+
})) {
59437+
EVT SrcVT = Op0.getOperand(0).getValueType();
59438+
EVT NewSrcVT = EVT::getVectorVT(Ctx, SrcVT.getScalarType(),
59439+
NumOps * SrcVT.getVectorNumElements());
59440+
unsigned SrcSizeInBits = SrcVT.getScalarSizeInBits();
59441+
if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(NewSrcVT) &&
59442+
(NewSrcVT.is256BitVector() ||
59443+
(NewSrcVT.is512BitVector() && Subtarget.useAVX512Regs() &&
59444+
(SrcSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
59445+
SDValue LHS = CombineSubOperand(NewSrcVT.getSimpleVT(), Ops, 0);
59446+
SDValue RHS = CombineSubOperand(NewSrcVT.getSimpleVT(), Ops, 1);
59447+
if (LHS || RHS)
59448+
return DAG.getNode(Opcode, DL, VT,
59449+
LHS ? LHS : ConcatSubOperand(NewSrcVT, Ops, 0),
59450+
RHS ? RHS : ConcatSubOperand(NewSrcVT, Ops, 1),
59451+
Op0.getOperand(2));
59452+
}
59453+
}
59454+
break;
5943059455
case ISD::CTPOP:
5943159456
case ISD::CTTZ:
5943259457
case ISD::CTLZ:
@@ -59791,13 +59816,16 @@ static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
5979159816
}
5979259817
}
5979359818

59794-
// Attempt to merge logic ops if the type is legal.
59795-
if (TLI.isTypeLegal(VT) && all_of(Ops, [](SDValue Op) {
59796-
return ISD::isBitwiseLogicOp(Op.getOpcode());
59797-
}))
59819+
// Attempt to merge comparison/logic ops if the type is legal.
59820+
if (TLI.isTypeLegal(VT) &&
59821+
(all_of(Ops, [](SDValue Op) { return Op.getOpcode() == ISD::SETCC; }) ||
59822+
all_of(Ops, [](SDValue Op) {
59823+
return ISD::isBitwiseLogicOp(Op.getOpcode());
59824+
}))) {
5979859825
if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops,
5979959826
DAG, Subtarget))
5980059827
return R;
59828+
}
5980159829

5980259830
// Don't do anything else for i1 vectors.
5980359831
return SDValue();

llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -52,13 +52,12 @@ define <8 x i1> @test3(<4 x i1> %a) {
5252
define <8 x i1> @test4(<4 x i1> %a, <4 x i1>%b) {
5353
; CHECK-LABEL: test4:
5454
; CHECK: # %bb.0:
55-
; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
56-
; CHECK-NEXT: vpmovd2m %xmm1, %k0
57-
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
58-
; CHECK-NEXT: vpmovd2m %xmm0, %k1
59-
; CHECK-NEXT: kshiftlb $4, %k0, %k0
60-
; CHECK-NEXT: korb %k0, %k1, %k0
55+
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
56+
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
57+
; CHECK-NEXT: vpslld $31, %ymm0, %ymm0
58+
; CHECK-NEXT: vpmovd2m %ymm0, %k0
6159
; CHECK-NEXT: vpmovm2w %k0, %xmm0
60+
; CHECK-NEXT: vzeroupper
6261
; CHECK-NEXT: retq
6362

6463
%res = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -68,13 +67,12 @@ define <8 x i1> @test4(<4 x i1> %a, <4 x i1>%b) {
6867
define <4 x i1> @test5(<2 x i1> %a, <2 x i1>%b) {
6968
; CHECK-LABEL: test5:
7069
; CHECK: # %bb.0:
71-
; CHECK-NEXT: vpsllq $63, %xmm1, %xmm1
72-
; CHECK-NEXT: vpmovq2m %xmm1, %k0
73-
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
74-
; CHECK-NEXT: vpmovq2m %xmm0, %k1
75-
; CHECK-NEXT: kshiftlb $2, %k0, %k0
76-
; CHECK-NEXT: korw %k0, %k1, %k0
70+
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
71+
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
72+
; CHECK-NEXT: vpsllq $63, %ymm0, %ymm0
73+
; CHECK-NEXT: vpmovq2m %ymm0, %k0
7774
; CHECK-NEXT: vpmovm2d %k0, %xmm0
75+
; CHECK-NEXT: vzeroupper
7876
; CHECK-NEXT: retq
7977

8078
%res = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

llvm/test/CodeGen/X86/combine-icmp.ll

Lines changed: 53 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -83,12 +83,12 @@ define i8 @concat_icmp_v8i32_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
8383
;
8484
; AVX512-LABEL: concat_icmp_v8i32_v4i32:
8585
; AVX512: # %bb.0:
86-
; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0
87-
; AVX512-NEXT: vptestnmd %xmm1, %xmm1, %k1
88-
; AVX512-NEXT: kshiftlb $4, %k1, %k1
89-
; AVX512-NEXT: korb %k1, %k0, %k0
86+
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
87+
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
88+
; AVX512-NEXT: vptestnmd %ymm0, %ymm0, %k0
9089
; AVX512-NEXT: kmovd %k0, %eax
9190
; AVX512-NEXT: # kill: def $al killed $al killed $eax
91+
; AVX512-NEXT: vzeroupper
9292
; AVX512-NEXT: retq
9393
%v0 = icmp eq <4 x i32> %a0, zeroinitializer
9494
%v1 = icmp eq <4 x i32> %a1, zeroinitializer
@@ -151,12 +151,12 @@ define i16 @concat_icmp_v16i16_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
151151
;
152152
; AVX512-LABEL: concat_icmp_v16i16_v8i16:
153153
; AVX512: # %bb.0:
154-
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
155-
; AVX512-NEXT: vpcmpnleuw %xmm2, %xmm0, %k0
156-
; AVX512-NEXT: vpcmpnleuw %xmm2, %xmm1, %k1
157-
; AVX512-NEXT: kunpckbw %k0, %k1, %k0
154+
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
155+
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
156+
; AVX512-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
158157
; AVX512-NEXT: kmovd %k0, %eax
159158
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
159+
; AVX512-NEXT: vzeroupper
160160
; AVX512-NEXT: retq
161161
%v0 = icmp ugt <8 x i16> %a0, splat (i16 1)
162162
%v1 = icmp ugt <8 x i16> %a1, splat (i16 1)
@@ -199,11 +199,11 @@ define i32 @concat_icmp_v32i8_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
199199
;
200200
; AVX512-LABEL: concat_icmp_v32i8_v16i8:
201201
; AVX512: # %bb.0:
202-
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
203-
; AVX512-NEXT: vpcmpgtb %xmm2, %xmm0, %k0
204-
; AVX512-NEXT: vpcmpgtb %xmm2, %xmm1, %k1
205-
; AVX512-NEXT: kunpckwd %k0, %k1, %k0
202+
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
203+
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
204+
; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
206205
; AVX512-NEXT: kmovd %k0, %eax
206+
; AVX512-NEXT: vzeroupper
207207
; AVX512-NEXT: retq
208208
%v0 = icmp sgt <16 x i8> %a0, splat (i8 5)
209209
%v1 = icmp sgt <16 x i8> %a1, splat (i8 5)
@@ -329,21 +329,15 @@ define i8 @concat_icmp_v8i64_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2,
329329
;
330330
; AVX512-LABEL: concat_icmp_v8i64_v2i64:
331331
; AVX512: # %bb.0:
332-
; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm4 = [128,128]
333-
; AVX512-NEXT: vpcmpltuq %xmm4, %xmm0, %k0
334-
; AVX512-NEXT: vpcmpltuq %xmm4, %xmm1, %k1
335-
; AVX512-NEXT: vpcmpltuq %xmm4, %xmm2, %k2
336-
; AVX512-NEXT: vpcmpltuq %xmm4, %xmm3, %k3
337-
; AVX512-NEXT: kshiftlb $2, %k3, %k3
338-
; AVX512-NEXT: korb %k3, %k2, %k2
339-
; AVX512-NEXT: kshiftlb $4, %k2, %k2
340-
; AVX512-NEXT: kshiftlb $2, %k1, %k1
341-
; AVX512-NEXT: korw %k1, %k0, %k0
342-
; AVX512-NEXT: kshiftlb $4, %k0, %k0
343-
; AVX512-NEXT: kshiftrb $4, %k0, %k0
344-
; AVX512-NEXT: korb %k2, %k0, %k0
332+
; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
333+
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
334+
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
335+
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
336+
; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
337+
; AVX512-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k0
345338
; AVX512-NEXT: kmovd %k0, %eax
346339
; AVX512-NEXT: # kill: def $al killed $al killed $eax
340+
; AVX512-NEXT: vzeroupper
347341
; AVX512-NEXT: retq
348342
%v0 = icmp ult <2 x i64> %a0, splat (i64 128)
349343
%v1 = icmp ult <2 x i64> %a1, splat (i64 128)
@@ -387,18 +381,16 @@ define i16 @concat_icmp_v16i32_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2
387381
;
388382
; AVX512-LABEL: concat_icmp_v16i32_v4i32:
389383
; AVX512: # %bb.0:
390-
; AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4
391-
; AVX512-NEXT: vpcmpgtd %xmm4, %xmm0, %k0
392-
; AVX512-NEXT: vpcmpgtd %xmm4, %xmm1, %k1
393-
; AVX512-NEXT: vpcmpgtd %xmm4, %xmm2, %k2
394-
; AVX512-NEXT: vpcmpgtd %xmm4, %xmm3, %k3
395-
; AVX512-NEXT: kshiftlb $4, %k1, %k1
396-
; AVX512-NEXT: korb %k1, %k0, %k0
397-
; AVX512-NEXT: kshiftlb $4, %k3, %k1
398-
; AVX512-NEXT: korb %k1, %k2, %k1
399-
; AVX512-NEXT: kunpckbw %k0, %k1, %k0
384+
; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
385+
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
386+
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
387+
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
388+
; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
389+
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
390+
; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
400391
; AVX512-NEXT: kmovd %k0, %eax
401392
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
393+
; AVX512-NEXT: vzeroupper
402394
; AVX512-NEXT: retq
403395
%v0 = icmp sgt <4 x i32> %a0, zeroinitializer
404396
%v1 = icmp sgt <4 x i32> %a1, zeroinitializer
@@ -468,14 +460,14 @@ define i32 @concat_icmp_v32i16_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2
468460
;
469461
; AVX512-LABEL: concat_icmp_v32i16_v8i16:
470462
; AVX512: # %bb.0:
471-
; AVX512-NEXT: vptestmw %xmm0, %xmm0, %k0
472-
; AVX512-NEXT: vptestmw %xmm1, %xmm1, %k1
473-
; AVX512-NEXT: vptestmw %xmm2, %xmm2, %k2
474-
; AVX512-NEXT: vptestmw %xmm3, %xmm3, %k3
475-
; AVX512-NEXT: kunpckbw %k0, %k1, %k0
476-
; AVX512-NEXT: kunpckbw %k2, %k3, %k1
477-
; AVX512-NEXT: kunpckwd %k0, %k1, %k0
463+
; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
464+
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
465+
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
466+
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
467+
; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
468+
; AVX512-NEXT: vptestmw %zmm0, %zmm0, %k0
478469
; AVX512-NEXT: kmovd %k0, %eax
470+
; AVX512-NEXT: vzeroupper
479471
; AVX512-NEXT: retq
480472
%v0 = icmp ne <8 x i16> %a0, zeroinitializer
481473
%v1 = icmp ne <8 x i16> %a1, zeroinitializer
@@ -560,15 +552,14 @@ define i64 @concat_icmp_v64i8_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2,
560552
;
561553
; AVX512-LABEL: concat_icmp_v64i8_v16i8:
562554
; AVX512: # %bb.0:
563-
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
564-
; AVX512-NEXT: vpcmpnleub %xmm4, %xmm0, %k0
565-
; AVX512-NEXT: vpcmpnleub %xmm4, %xmm1, %k1
566-
; AVX512-NEXT: vpcmpnleub %xmm4, %xmm2, %k2
567-
; AVX512-NEXT: vpcmpnleub %xmm4, %xmm3, %k3
568-
; AVX512-NEXT: kunpckwd %k0, %k1, %k0
569-
; AVX512-NEXT: kunpckwd %k2, %k3, %k1
570-
; AVX512-NEXT: kunpckdq %k0, %k1, %k0
555+
; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
556+
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
557+
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
558+
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
559+
; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
560+
; AVX512-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
571561
; AVX512-NEXT: kmovq %k0, %rax
562+
; AVX512-NEXT: vzeroupper
572563
; AVX512-NEXT: retq
573564
%v0 = icmp ugt <16 x i8> %a0, splat (i8 15)
574565
%v1 = icmp ugt <16 x i8> %a1, splat (i8 15)
@@ -672,10 +663,9 @@ define i8 @concat_icmp_v8i64_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
672663
;
673664
; AVX512-LABEL: concat_icmp_v8i64_v4i64:
674665
; AVX512: # %bb.0:
675-
; AVX512-NEXT: vptestnmq %ymm0, %ymm0, %k0
676-
; AVX512-NEXT: vptestnmq %ymm1, %ymm1, %k1
677-
; AVX512-NEXT: kshiftlb $4, %k1, %k1
678-
; AVX512-NEXT: korb %k1, %k0, %k0
666+
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
667+
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
668+
; AVX512-NEXT: vptestnmq %zmm0, %zmm0, %k0
679669
; AVX512-NEXT: kmovd %k0, %eax
680670
; AVX512-NEXT: # kill: def $al killed $al killed $eax
681671
; AVX512-NEXT: vzeroupper
@@ -768,10 +758,9 @@ define i16 @concat_icmp_v16i32_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
768758
;
769759
; AVX512-LABEL: concat_icmp_v16i32_v8i32:
770760
; AVX512: # %bb.0:
771-
; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
772-
; AVX512-NEXT: vpcmpnleud %ymm2, %ymm0, %k0
773-
; AVX512-NEXT: vpcmpnleud %ymm2, %ymm1, %k1
774-
; AVX512-NEXT: kunpckbw %k0, %k1, %k0
761+
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
762+
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
763+
; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0
775764
; AVX512-NEXT: kmovd %k0, %eax
776765
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
777766
; AVX512-NEXT: vzeroupper
@@ -830,10 +819,9 @@ define i32 @concat_icmp_v32i16_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
830819
;
831820
; AVX512-LABEL: concat_icmp_v32i16_v16i16:
832821
; AVX512: # %bb.0:
833-
; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
834-
; AVX512-NEXT: vpcmpgtw %ymm2, %ymm0, %k0
835-
; AVX512-NEXT: vpcmpgtw %ymm2, %ymm1, %k1
836-
; AVX512-NEXT: kunpckwd %k0, %k1, %k0
822+
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
823+
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
824+
; AVX512-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
837825
; AVX512-NEXT: kmovd %k0, %eax
838826
; AVX512-NEXT: vzeroupper
839827
; AVX512-NEXT: retq
@@ -903,10 +891,9 @@ define i64 @concat_icmp_v64i8_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
903891
;
904892
; AVX512-LABEL: concat_icmp_v64i8_v32i8:
905893
; AVX512: # %bb.0:
906-
; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
907-
; AVX512-NEXT: vpcmpgtb %ymm0, %ymm2, %k0
908-
; AVX512-NEXT: vpcmpgtb %ymm1, %ymm2, %k1
909-
; AVX512-NEXT: kunpckdq %k0, %k1, %k0
894+
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
895+
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
896+
; AVX512-NEXT: vpcmpltb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
910897
; AVX512-NEXT: kmovq %k0, %rax
911898
; AVX512-NEXT: vzeroupper
912899
; AVX512-NEXT: retq

0 commit comments

Comments
 (0)