Skip to content

Commit

Permalink
[X86] fold select to mask instructions.
Browse files Browse the repository at this point in the history
When AVX512 is available, the LHS operand of a select instruction can be
folded into a masked instruction, while the RHS operand cannot. This patch
commutes the LHS and RHS of the select instruction to create an
opportunity for folding.

Differential Revision: https://reviews.llvm.org/D151535
  • Loading branch information
LuoYuanke committed May 26, 2023
1 parent fd89df1 commit 969c686
Show file tree
Hide file tree
Showing 7 changed files with 113 additions and 100 deletions.
41 changes: 41 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -20029,6 +20029,9 @@ static bool canCombineAsMaskOperation(SDValue V,
if (!Subtarget.hasAVX512())
return false;

if (!V.getValueType().isSimple())
return false;

MVT VT = V.getSimpleValueType().getScalarType();
if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
return false;
Expand Down Expand Up @@ -46724,6 +46727,37 @@ static SDValue combineLogicBlendIntoConditionalNegate(
return DAG.getBitcast(VT, Res);
}

/// Swap the two value operands of a VSELECT, inverting its SETCC condition,
/// when only the second value operand passes canCombineAsMaskOperation.
///
///   (vselect M, L, R) -> (vselect ~M, R, L)
///
/// Returns an empty SDValue when the rewrite does not apply: no AVX512, node
/// is not a VSELECT, the first value operand already qualifies, the second
/// one does not, or the condition is not a single-use SETCC.
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG,
                             const X86Subtarget &Subtarget) {
  if (N->getOpcode() != ISD::VSELECT || !Subtarget.hasAVX512())
    return SDValue();

  SDValue Mask = N->getOperand(0);
  SDValue TrueVal = N->getOperand(1);
  SDValue FalseVal = N->getOperand(2);

  // Commuting only pays off when the first value operand cannot be combined
  // as a mask operation but the second one can.
  const bool WorthCommuting =
      !canCombineAsMaskOperation(TrueVal, Subtarget) &&
      canCombineAsMaskOperation(FalseVal, Subtarget);
  if (!WorthCommuting)
    return SDValue();

  // The condition is rebuilt with the inverse predicate, so it has to be a
  // SETCC, and one that no other node is using.
  if (Mask.getOpcode() != ISD::SETCC || !Mask.hasOneUse())
    return SDValue();

  SDValue CmpLHS = Mask.getOperand(0);
  SDValue CmpRHS = Mask.getOperand(1);
  ISD::CondCode OrigCC = cast<CondCodeSDNode>(Mask.getOperand(2))->get();

  // The inverse predicate is derived from the type of the compared operands.
  ISD::CondCode InverseCC =
      ISD::getSetCCInverse(OrigCC, CmpLHS.getValueType());
  SDValue InverseMask = DAG.getSetCC(SDLoc(Mask), Mask.getValueType(), CmpLHS,
                                     CmpRHS, InverseCC);

  // Emit the select with the inverted mask and the value operands exchanged.
  return DAG.getSelect(SDLoc(N), TrueVal.getValueType(), InverseMask, FalseVal,
                       TrueVal);
}

/// Do target-specific dag combines on SELECT and VSELECT nodes.
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
Expand All @@ -46738,6 +46772,13 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
return V;

// When avx512 is available the lhs operand of select instruction can be
// folded with mask instruction, while the rhs operand can't. Commute the
// lhs and rhs of the select instruction to create the opportunity of
// folding.
if (SDValue V = commuteSelect(N, DAG, Subtarget))
return V;

EVT VT = LHS.getValueType();
EVT CondVT = Cond.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll
Expand Up @@ -220,9 +220,9 @@ define <2 x i64> @setcc_commute(<2 x i64> %a) {
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpsubq %xmm0, %xmm1, %xmm1
; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
%1 = sub <2 x i64> zeroinitializer, %a
Expand Down
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/X86/combine-rotates.ll
Expand Up @@ -163,10 +163,8 @@ define <4 x i32> @combine_vec_rot_select_zero(<4 x i32>, <4 x i32>) {
;
; AVX512-LABEL: combine_vec_rot_select_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vptestnmd %xmm1, %xmm1, %k1
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm2 {%k1}
; AVX512-NEXT: vmovdqa %xmm2, %xmm0
; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: retq
%3 = and <4 x i32> %1, <i32 31, i32 31, i32 31, i32 31>
%4 = shl <4 x i32> %0, %3
Expand Down
14 changes: 6 additions & 8 deletions llvm/test/CodeGen/X86/paddus.ll
Expand Up @@ -450,10 +450,9 @@ define <64 x i8> @test13(<64 x i8> %x) {
;
; AVX512-LABEL: test13:
; AVX512: # %bb.0:
; AVX512-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512-NEXT: vpsubb %zmm2, %zmm0, %zmm1
; AVX512-NEXT: vpcmpeqb %zmm2, %zmm0, %k1
; AVX512-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512-NEXT: vpcmpneqb %zmm1, %zmm0, %k1
; AVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm1 {%k1}
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT: retq
%1 = add <64 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
Expand Down Expand Up @@ -1203,10 +1202,9 @@ define <32 x i16> @test31(<32 x i16> %x) {
;
; AVX512-LABEL: test31:
; AVX512: # %bb.0:
; AVX512-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512-NEXT: vpsubw %zmm2, %zmm0, %zmm1
; AVX512-NEXT: vpcmpeqw %zmm2, %zmm0, %k1
; AVX512-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1}
; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512-NEXT: vpcmpneqw %zmm1, %zmm0, %k1
; AVX512-NEXT: vpsubw %zmm1, %zmm0, %zmm1 {%k1}
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT: retq
%1 = add <32 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
Expand Down
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/X86/sat-add.ll
Expand Up @@ -1120,11 +1120,11 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_notval(<4 x i32> %x, <4
;
; AVX512-LABEL: unsigned_sat_variable_v4i32_using_cmp_notval:
; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpcmpnleud %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovdqa32 %xmm3, %xmm2 {%k1}
; AVX512-NEXT: vmovdqa %xmm1, %xmm3
; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm3
; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpcmpleud %xmm3, %xmm0, %k1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm2 {%k1}
; AVX512-NEXT: vmovdqa %xmm2, %xmm0
; AVX512-NEXT: retq
%noty = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
Expand Down Expand Up @@ -1343,11 +1343,11 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_notval(<2 x i64> %x, <2
;
; AVX512-LABEL: unsigned_sat_variable_v2i64_using_cmp_notval:
; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovdqa64 %xmm3, %xmm2 {%k1}
; AVX512-NEXT: vmovdqa %xmm1, %xmm3
; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm3
; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpcmpleuq %xmm3, %xmm0, %k1
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm2 {%k1}
; AVX512-NEXT: vmovdqa %xmm2, %xmm0
; AVX512-NEXT: retq
%noty = xor <2 x i64> %y, <i64 -1, i64 -1>
Expand Down
102 changes: 39 additions & 63 deletions llvm/test/CodeGen/X86/vector-bo-select-avx512.ll
Expand Up @@ -6,10 +6,8 @@ define dso_local <8 x i64> @select_sub(<8 x i64> %src, <8 x i64> %a, <8 x i64> %
; AVX512-LABEL: select_sub:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3
; AVX512-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1
; AVX512-NEXT: vpsubq %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT: vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1
; AVX512-NEXT: vpsubq %zmm2, %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
entry:
%arrayidx = getelementptr inbounds <8 x i64>, ptr %ptr, i64 1
Expand All @@ -25,10 +23,8 @@ define dso_local <8 x i64> @select_add(<8 x i64> %src, <8 x i64> %a, <8 x i64> %
; AVX512-LABEL: select_add:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3
; AVX512-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1
; AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT: vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1
; AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
entry:
%arrayidx = getelementptr inbounds <8 x i64>, ptr %ptr, i64 1
Expand All @@ -44,10 +40,8 @@ define dso_local <8 x i64> @select_and(<8 x i64> %src, <8 x i64> %a, <8 x i64> %
; AVX512-LABEL: select_and:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3
; AVX512-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1
; AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT: vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1
; AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
entry:
%arrayidx = getelementptr inbounds <8 x i64>, ptr %ptr, i64 1
Expand All @@ -63,10 +57,8 @@ define dso_local <8 x i64> @select_xor(<8 x i64> %src, <8 x i64> %a, <8 x i64> %
; AVX512-LABEL: select_xor:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3
; AVX512-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1
; AVX512-NEXT: vpxorq %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT: vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1
; AVX512-NEXT: vpxorq %zmm2, %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
entry:
%arrayidx = getelementptr inbounds <8 x i64>, ptr %ptr, i64 1
Expand All @@ -82,10 +74,8 @@ define dso_local <8 x i64> @select_shl(<8 x i64> %src, <8 x i64> %a, <8 x i64> %
; AVX512-LABEL: select_shl:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3
; AVX512-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1
; AVX512-NEXT: vpsllvq %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT: vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1
; AVX512-NEXT: vpsllvq %zmm2, %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
entry:
%arrayidx = getelementptr inbounds <8 x i64>, ptr %ptr, i64 1
Expand All @@ -101,10 +91,8 @@ define dso_local <8 x i64> @select_srl(<8 x i64> %src, <8 x i64> %a, <8 x i64> %
; AVX512-LABEL: select_srl:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3
; AVX512-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1
; AVX512-NEXT: vpsrlvq %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT: vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1
; AVX512-NEXT: vpsrlvq %zmm2, %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
entry:
%arrayidx = getelementptr inbounds <8 x i64>, ptr %ptr, i64 1
Expand All @@ -120,10 +108,8 @@ define dso_local <8 x i64> @select_sra(<8 x i64> %src, <8 x i64> %a, <8 x i64> %
; AVX512-LABEL: select_sra:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3
; AVX512-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1
; AVX512-NEXT: vpsravq %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT: vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1
; AVX512-NEXT: vpsravq %zmm2, %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
entry:
%arrayidx = getelementptr inbounds <8 x i64>, ptr %ptr, i64 1
Expand All @@ -140,19 +126,17 @@ define dso_local <8 x i32> @select_mul(<8 x i32> %src, <8 x i32> %a, <8 x i32> %
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512F-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1
; AVX512F-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1
; AVX512F-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: select_mul:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512VL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1
; AVX512VL-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VL-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1
; AVX512VL-NEXT: vpmulld %ymm2, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
entry:
%arrayidx = getelementptr inbounds <8 x i32>, ptr %ptr, i32 1
Expand All @@ -169,19 +153,17 @@ define dso_local <8 x i32> @select_smax(<8 x i32> %src, <8 x i32> %a, <8 x i32>
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512F-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1
; AVX512F-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1
; AVX512F-NEXT: vpmaxsd %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: select_smax:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512VL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1
; AVX512VL-NEXT: vpmaxsd %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VL-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1
; AVX512VL-NEXT: vpmaxsd %ymm2, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
entry:
%arrayidx = getelementptr inbounds <8 x i32>, ptr %ptr, i32 1
Expand All @@ -199,19 +181,17 @@ define dso_local <8 x i32> @select_smin(<8 x i32> %src, <8 x i32> %a, <8 x i32>
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512F-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1
; AVX512F-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1
; AVX512F-NEXT: vpminsd %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: select_smin:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512VL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1
; AVX512VL-NEXT: vpminsd %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VL-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1
; AVX512VL-NEXT: vpminsd %ymm2, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
entry:
%arrayidx = getelementptr inbounds <8 x i32>, ptr %ptr, i32 1
Expand All @@ -229,19 +209,17 @@ define dso_local <8 x i32> @select_umax(<8 x i32> %src, <8 x i32> %a, <8 x i32>
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512F-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1
; AVX512F-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1
; AVX512F-NEXT: vpmaxud %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: select_umax:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512VL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1
; AVX512VL-NEXT: vpmaxud %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VL-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1
; AVX512VL-NEXT: vpmaxud %ymm2, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
entry:
%arrayidx = getelementptr inbounds <8 x i32>, ptr %ptr, i32 1
Expand All @@ -259,19 +237,17 @@ define dso_local <8 x i32> @select_umin(<8 x i32> %src, <8 x i32> %a, <8 x i32>
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512F-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1
; AVX512F-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1
; AVX512F-NEXT: vpminud %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: select_umin:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512VL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1
; AVX512VL-NEXT: vpminud %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VL-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1
; AVX512VL-NEXT: vpminud %ymm2, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
entry:
%arrayidx = getelementptr inbounds <8 x i32>, ptr %ptr, i32 1
Expand Down

0 comments on commit 969c686

Please sign in to comment.