diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index f3e036ed1b947..3be957378286f 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -8582,6 +8582,8 @@ LegalizerHelper::lowerThreewayCompare(MachineInstr &MI) {
   LLT DstTy = MRI.getType(Dst);
   LLT SrcTy = MRI.getType(Cmp->getReg(1));
   LLT CmpTy = DstTy.changeElementSize(1);
+  auto LHS = MIRBuilder.buildFreeze(SrcTy, Cmp->getLHSReg());
+  auto RHS = MIRBuilder.buildFreeze(SrcTy, Cmp->getRHSReg());

   CmpInst::Predicate LTPredicate = Cmp->isSigned()
                                        ? CmpInst::Predicate::ICMP_SLT
@@ -8591,10 +8593,8 @@ LegalizerHelper::lowerThreewayCompare(MachineInstr &MI) {
                                        : CmpInst::Predicate::ICMP_UGT;

   auto Zero = MIRBuilder.buildConstant(DstTy, 0);
-  auto IsGT = MIRBuilder.buildICmp(GTPredicate, CmpTy, Cmp->getLHSReg(),
-                                   Cmp->getRHSReg());
-  auto IsLT = MIRBuilder.buildICmp(LTPredicate, CmpTy, Cmp->getLHSReg(),
-                                   Cmp->getRHSReg());
+  auto IsGT = MIRBuilder.buildICmp(GTPredicate, CmpTy, LHS, RHS);
+  auto IsLT = MIRBuilder.buildICmp(LTPredicate, CmpTy, LHS, RHS);

   auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
   auto BC = TLI.getBooleanContents(DstTy.isVector(), /*isFP=*/false);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 80500e48351e4..02f85cfc9262e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10956,8 +10956,8 @@ SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const {

 SDValue TargetLowering::expandCMP(SDNode *Node, SelectionDAG &DAG) const {
   unsigned Opcode = Node->getOpcode();
-  SDValue LHS = Node->getOperand(0);
-  SDValue RHS = Node->getOperand(1);
+  SDValue LHS = DAG.getFreeze(Node->getOperand(0));
+  SDValue RHS = DAG.getFreeze(Node->getOperand(1));
   EVT VT = LHS.getValueType();
   EVT ResVT = Node->getValueType(0);
   EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 9052cbfa89deb..01ab006d288fa 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -10479,6 +10479,9 @@ SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {

   // Special case for Thumb1 UCMP only
   if (!IsSigned && Subtarget->isThumb1Only()) {
+    LHS = DAG.getFreeze(LHS);
+    RHS = DAG.getFreeze(RHS);
+
     // For Thumb unsigned comparison, use this sequence:
     // subs r2, r0, r1   ; r2 = LHS - RHS, sets flags
     // sbc r2, r2        ; r2 = r2 - r2 - !carry
@@ -10511,10 +10514,7 @@ SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {

     // Final subtraction: Sbc1Result - Sbc2Result (no flags needed)
     SDValue Result = DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
-    if (Op.getValueType() != MVT::i32)
-      Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
-
-    return Result;
+    return DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
   }

   // For the ARM assembly pattern:
@@ -10582,10 +10582,7 @@ SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
   SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
                                 LTCondValue, Flags);

-  if (Op.getValueType() != MVT::i32)
-    Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
-
-  return Result2;
+  return DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
 }

 SDValue
 ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 2907303874de5..932c224561033 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -585,6 +585,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,

   // We cannot sextinreg(i1). Expand to shifts.
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+  // Custom lowering for the ucmp (unsigned 3-way compare) node.
+  setOperationAction(ISD::UCMP, MVT::i32, Custom);
+  setOperationAction(ISD::UCMP, MVT::i64, isPPC64 ? Custom : Expand);
+
   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
   // support continuation, user-level threading, and etc.. As a result, no
@@ -12618,6 +12622,33 @@ SDValue PPCTargetLowering::LowerSSUBO(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getMergeValues({Sub, OverflowTrunc}, dl);
 }

+// Lower unsigned 3-way compare producing -1/0/1.
+SDValue PPCTargetLowering::LowerUCMP(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue A = DAG.getFreeze(Op.getOperand(0));
+  SDValue B = DAG.getFreeze(Op.getOperand(1));
+  EVT OpVT = A.getValueType();   // operand type
+  EVT ResVT = Op.getValueType(); // result type
+
+  // First compute diff = A - B (will become subf).
+  SDValue Diff = DAG.getNode(ISD::SUB, DL, OpVT, A, B);
+
+  // Generate B - A using SUBC to capture carry.
+  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+  SDValue SubC = DAG.getNode(PPCISD::SUBC, DL, VTs, B, A);
+  SDValue CA0 = SubC.getValue(1);
+
+  // t2 = A - B + CA0 using SUBE.
+  SDValue SubE1 = DAG.getNode(PPCISD::SUBE, DL, VTs, A, B, CA0);
+  SDValue CA1 = SubE1.getValue(1);
+
+  // res = diff - t2 + CA1 using SUBE (produces desired -1/0/1).
+  SDValue ResPair = DAG.getNode(PPCISD::SUBE, DL, VTs, Diff, SubE1, CA1);
+
+  // Extract the first result and sign-extend or truncate it to the result type.
+  return DAG.getSExtOrTrunc(ResPair.getValue(0), DL, ResVT);
+}
+
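Note on the carry-chain trick used by LowerUCMP above: for unsigned values, b - a borrows exactly when a > b, and a - b borrows exactly when a < b, so subtracting the two borrow bits already yields the desired -1/0/+1; on PowerPC the CA bit records the complement of the borrow, which is what the subc/subfe chain threads through. The standalone C++ sketch below only checks that identity against the reference ucmp semantics; it is illustrative, not the PPCISD::SUBC/SUBE node semantics, and ucmpRef/ucmpBorrow are hypothetical helper names.

#include <cassert>
#include <cstdint>

// Reference semantics of llvm.ucmp: -1 if a < b, 0 if a == b, +1 if a > b.
static int ucmpRef(uint64_t a, uint64_t b) { return (a > b) - (a < b); }

// Borrow-based formulation: the borrow out of (b - a) is set exactly when
// a > b, and the borrow out of (a - b) is set exactly when a < b, so the
// difference of the two borrow bits is already the -1/0/+1 result.
static int ucmpBorrow(uint64_t a, uint64_t b) {
  int BorrowBA = b < a; // (b - a) would underflow => a > b
  int BorrowAB = a < b; // (a - b) would underflow => a < b
  return BorrowBA - BorrowAB;
}

int main() {
  const uint64_t Vals[] = {0, 1, 2, UINT64_MAX};
  for (uint64_t a : Vals)
    for (uint64_t b : Vals)
      assert(ucmpRef(a, b) == ucmpBorrow(a, b));
  return 0;
}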
 /// LowerOperation - Provide custom lowering hooks for some operations.
 ///
 SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
@@ -12722,6 +12753,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::UADDO_CARRY:
   case ISD::USUBO_CARRY:
     return LowerADDSUBO_CARRY(Op, DAG);
+  case ISD::UCMP:
+    return LowerUCMP(Op, DAG);
   }
 }

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 669430550f4e6..b82533fac2eb8 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1343,6 +1343,7 @@ namespace llvm {
     SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerUCMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir
index ae16e40671785..e1c63005ee9d2 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir
@@ -7,8 +7,10 @@ body: |
     ; CHECK-LABEL: name: test_scmp
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY]](s64), [[COPY1]]
-    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY]](s64), [[COPY1]]
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY]]
+    ; CHECK-NEXT: [[FREEZE1:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[FREEZE]](s64), [[FREEZE1]]
+    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[FREEZE]](s64), [[FREEZE1]]
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[C]], [[C1]]
@@ -30,8 +32,10 @@ body: |
     ; CHECK-LABEL: name: test_ucmp
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY]](s64), [[COPY1]]
-    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]]
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY]]
+    ; CHECK-NEXT: [[FREEZE1:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[FREEZE]](s64), [[FREEZE1]]
+    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[FREEZE]](s64), [[FREEZE1]]
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[C]], [[C1]]
@@ -61,8 +65,10 @@ body: |
     ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $w2
     ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $w3
     ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
-    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(ugt), [[BUILD_VECTOR]](<4 x s32>), [[BUILD_VECTOR1]]
-    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(ult), [[BUILD_VECTOR]](<4 x s32>), [[BUILD_VECTOR1]]
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<4 x s32>) = G_FREEZE [[BUILD_VECTOR]]
+    ; CHECK-NEXT: [[FREEZE1:%[0-9]+]]:_(<4 x s32>) =
G_FREEZE [[BUILD_VECTOR1]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(ugt), [[FREEZE]](<4 x s32>), [[FREEZE1]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(ult), [[FREEZE]](<4 x s32>), [[FREEZE1]] ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP1]](<4 x s32>) ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP]](<4 x s32>) ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<4 x s16>) = G_SUB [[TRUNC]], [[TRUNC1]] @@ -92,13 +98,17 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY]](s64), [[COPY1]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[DEF]](s64), [[DEF]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[DEF]](s64), [[DEF]] + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY]] + ; CHECK-NEXT: [[FREEZE1:%[0-9]+]]:_(s64) = G_FREEZE [[DEF]] + ; CHECK-NEXT: [[FREEZE2:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]] + ; CHECK-NEXT: [[FREEZE3:%[0-9]+]]:_(s64) = G_FREEZE [[DEF]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[FREEZE]](s64), [[FREEZE2]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[FREEZE1]](s64), [[FREEZE3]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[FREEZE1]](s64), [[FREEZE3]] ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]] - ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]] - ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[DEF]](s64), [[DEF]] - ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[DEF]](s64), [[DEF]] + ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[FREEZE]](s64), [[FREEZE2]] + ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[FREEZE1]](s64), [[FREEZE3]] + ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[FREEZE1]](s64), [[FREEZE3]] ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s32), [[ICMP3]], [[ICMP4]] ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 diff --git a/llvm/test/CodeGen/AArch64/freeze.ll b/llvm/test/CodeGen/AArch64/freeze.ll index fae3bbe2dcfba..cdc7fc12404ca 100644 --- a/llvm/test/CodeGen/AArch64/freeze.ll +++ b/llvm/test/CodeGen/AArch64/freeze.ll @@ -522,16 +522,28 @@ define i32 @freeze_scmp(i32 %a0) nounwind { } define i32 @freeze_ucmp(i32 %a0) nounwind { -; CHECK-LABEL: freeze_ucmp: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #2 // =0x2 -; CHECK-NEXT: cmp w8, w0 -; CHECK-NEXT: cset w8, hi -; CHECK-NEXT: csinv w8, w8, wzr, hs -; CHECK-NEXT: cmp w8, #1 -; CHECK-NEXT: cset w8, hi -; CHECK-NEXT: csinv w0, w8, wzr, hs -; CHECK-NEXT: ret +; CHECK-SD-LABEL: freeze_ucmp: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w8, #2 // =0x2 +; CHECK-SD-NEXT: cmp w8, w0 +; CHECK-SD-NEXT: cset w8, hi +; CHECK-SD-NEXT: csinv w8, w8, wzr, hs +; CHECK-SD-NEXT: cmp w8, #1 +; CHECK-SD-NEXT: cset w8, hi +; CHECK-SD-NEXT: csinv w0, w8, wzr, hs +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: freeze_ucmp: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #2 // =0x2 +; CHECK-GI-NEXT: mov w9, #1 // =0x1 +; CHECK-GI-NEXT: cmp w8, w0 +; CHECK-GI-NEXT: cset w8, hi +; CHECK-GI-NEXT: csinv w8, w8, wzr, hs +; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: cset w8, hi +; CHECK-GI-NEXT: csinv w0, w8, wzr, hs +; CHECK-GI-NEXT: ret %x = call i32 @llvm.ucmp.i32(i32 
2, i32 %a0) %y = freeze i32 %x %z = call i32 @llvm.ucmp.i32(i32 %y, i32 1) diff --git a/llvm/test/CodeGen/AArch64/ucmp.ll b/llvm/test/CodeGen/AArch64/ucmp.ll index af8225307fedd..6b5bcfa400230 100644 --- a/llvm/test/CodeGen/AArch64/ucmp.ll +++ b/llvm/test/CodeGen/AArch64/ucmp.ll @@ -13,8 +13,8 @@ define i8 @ucmp.8.8(i8 %x, i8 %y) nounwind { ; ; CHECK-GI-LABEL: ucmp.8.8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: and w8, w0, #0xff -; CHECK-GI-NEXT: and w9, w1, #0xff +; CHECK-GI-NEXT: uxtb w8, w0 +; CHECK-GI-NEXT: uxtb w9, w1 ; CHECK-GI-NEXT: cmp w8, w9 ; CHECK-GI-NEXT: cset w8, hi ; CHECK-GI-NEXT: csinv w0, w8, wzr, hs @@ -34,8 +34,8 @@ define i8 @ucmp.8.16(i16 %x, i16 %y) nounwind { ; ; CHECK-GI-LABEL: ucmp.8.16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: and w8, w0, #0xffff -; CHECK-GI-NEXT: and w9, w1, #0xffff +; CHECK-GI-NEXT: uxth w8, w0 +; CHECK-GI-NEXT: uxth w9, w1 ; CHECK-GI-NEXT: cmp w8, w9 ; CHECK-GI-NEXT: cset w8, hi ; CHECK-GI-NEXT: csinv w0, w8, wzr, hs diff --git a/llvm/test/CodeGen/ARM/scmp.ll b/llvm/test/CodeGen/ARM/scmp.ll index 9189aee6aaf43..07a08f46ee1ca 100644 --- a/llvm/test/CodeGen/ARM/scmp.ll +++ b/llvm/test/CodeGen/ARM/scmp.ll @@ -58,23 +58,23 @@ define i8 @scmp_8_128(i128 %x, i128 %y) nounwind { ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr} ; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr} -; CHECK-NEXT: ldr r4, [sp, #24] -; CHECK-NEXT: mov r5, #0 -; CHECK-NEXT: ldr r6, [sp, #28] -; CHECK-NEXT: subs r7, r0, r4 -; CHECK-NEXT: ldr r12, [sp, #32] -; CHECK-NEXT: sbcs r7, r1, r6 -; CHECK-NEXT: ldr lr, [sp, #36] -; CHECK-NEXT: sbcs r7, r2, r12 -; CHECK-NEXT: sbcs r7, r3, lr +; CHECK-NEXT: ldr r5, [sp, #24] +; CHECK-NEXT: mov r6, #0 +; CHECK-NEXT: ldr r4, [sp, #28] +; CHECK-NEXT: subs r7, r0, r5 +; CHECK-NEXT: ldr lr, [sp, #32] +; CHECK-NEXT: sbcs r7, r1, r4 +; CHECK-NEXT: ldr r12, [sp, #36] +; CHECK-NEXT: sbcs r7, r2, lr +; CHECK-NEXT: sbcs r7, r3, r12 ; CHECK-NEXT: mov r7, #0 ; CHECK-NEXT: movwlt r7, #1 -; CHECK-NEXT: subs r0, r4, r0 -; CHECK-NEXT: sbcs r0, r6, r1 -; CHECK-NEXT: sbcs r0, r12, r2 -; CHECK-NEXT: sbcs r0, lr, r3 -; CHECK-NEXT: movwlt r5, #1 -; CHECK-NEXT: sub r0, r5, r7 +; CHECK-NEXT: subs r0, r5, r0 +; CHECK-NEXT: sbcs r0, r4, r1 +; CHECK-NEXT: sbcs r0, lr, r2 +; CHECK-NEXT: sbcs r0, r12, r3 +; CHECK-NEXT: movwlt r6, #1 +; CHECK-NEXT: sub r0, r6, r7 ; CHECK-NEXT: pop {r4, r5, r6, r7, r11, pc} %1 = call i8 @llvm.scmp(i128 %x, i128 %y) ret i8 %1 diff --git a/llvm/test/CodeGen/ARM/ucmp.ll b/llvm/test/CodeGen/ARM/ucmp.ll index bb0201454d1ea..a15cc4cca0d39 100644 --- a/llvm/test/CodeGen/ARM/ucmp.ll +++ b/llvm/test/CodeGen/ARM/ucmp.ll @@ -58,23 +58,23 @@ define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind { ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr} ; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr} -; CHECK-NEXT: ldr r4, [sp, #24] -; CHECK-NEXT: mov r5, #0 -; CHECK-NEXT: ldr r6, [sp, #28] -; CHECK-NEXT: subs r7, r0, r4 -; CHECK-NEXT: ldr r12, [sp, #32] -; CHECK-NEXT: sbcs r7, r1, r6 -; CHECK-NEXT: ldr lr, [sp, #36] -; CHECK-NEXT: sbcs r7, r2, r12 -; CHECK-NEXT: sbcs r7, r3, lr +; CHECK-NEXT: ldr r5, [sp, #24] +; CHECK-NEXT: mov r6, #0 +; CHECK-NEXT: ldr r4, [sp, #28] +; CHECK-NEXT: subs r7, r0, r5 +; CHECK-NEXT: ldr lr, [sp, #32] +; CHECK-NEXT: sbcs r7, r1, r4 +; CHECK-NEXT: ldr r12, [sp, #36] +; CHECK-NEXT: sbcs r7, r2, lr +; CHECK-NEXT: sbcs r7, r3, r12 ; CHECK-NEXT: mov r7, #0 ; CHECK-NEXT: movwlo r7, #1 -; CHECK-NEXT: subs r0, r4, r0 -; CHECK-NEXT: sbcs r0, r6, r1 -; CHECK-NEXT: sbcs r0, r12, r2 -; CHECK-NEXT: sbcs r0, lr, r3 -; 
CHECK-NEXT: movwlo r5, #1 -; CHECK-NEXT: sub r0, r5, r7 +; CHECK-NEXT: subs r0, r5, r0 +; CHECK-NEXT: sbcs r0, r4, r1 +; CHECK-NEXT: sbcs r0, lr, r2 +; CHECK-NEXT: sbcs r0, r12, r3 +; CHECK-NEXT: movwlo r6, #1 +; CHECK-NEXT: sub r0, r6, r7 ; CHECK-NEXT: pop {r4, r5, r6, r7, r11, pc} %1 = call i8 @llvm.ucmp(i128 %x, i128 %y) ret i8 %1 diff --git a/llvm/test/CodeGen/PowerPC/memcmp.ll b/llvm/test/CodeGen/PowerPC/memcmp.ll index 39f9269997315..4998d87cf397b 100644 --- a/llvm/test/CodeGen/PowerPC/memcmp.ll +++ b/llvm/test/CodeGen/PowerPC/memcmp.ll @@ -6,12 +6,10 @@ define signext i32 @memcmp8(ptr nocapture readonly %buffer1, ptr nocapture reado ; CHECK: # %bb.0: ; CHECK-NEXT: ldbrx 3, 0, 3 ; CHECK-NEXT: ldbrx 4, 0, 4 -; CHECK-NEXT: cmpld 3, 4 -; CHECK-NEXT: subc 3, 4, 3 -; CHECK-NEXT: subfe 3, 4, 4 -; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: neg 3, 3 -; CHECK-NEXT: isellt 3, 4, 3 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: extsw 3, 3 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 8) @@ -23,11 +21,11 @@ define signext i32 @memcmp4(ptr nocapture readonly %buffer1, ptr nocapture reado ; CHECK: # %bb.0: ; CHECK-NEXT: lwbrx 3, 0, 3 ; CHECK-NEXT: lwbrx 4, 0, 4 -; CHECK-NEXT: cmplw 3, 4 -; CHECK-NEXT: sub 5, 4, 3 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: rldicl 5, 5, 1, 63 -; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 +; CHECK-NEXT: extsw 3, 3 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 4) ret i32 %call diff --git a/llvm/test/CodeGen/PowerPC/scmp.ll b/llvm/test/CodeGen/PowerPC/scmp.ll index 107137c0bea7c..e2d9886184654 100644 --- a/llvm/test/CodeGen/PowerPC/scmp.ll +++ b/llvm/test/CodeGen/PowerPC/scmp.ll @@ -5,10 +5,10 @@ define i8 @scmp_8_8(i8 signext %x, i8 signext %y) nounwind { ; CHECK-LABEL: scmp_8_8: ; CHECK: # %bb.0: ; CHECK-NEXT: cmpw 3, 4 -; CHECK-NEXT: sub 5, 4, 3 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: rldicl 5, 5, 1, 63 -; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: sub 3, 4, 3 +; CHECK-NEXT: li 4, -1 +; CHECK-NEXT: rldicl 3, 3, 1, 63 +; CHECK-NEXT: isellt 3, 4, 3 ; CHECK-NEXT: blr %1 = call i8 @llvm.scmp(i8 %x, i8 %y) ret i8 %1 @@ -18,10 +18,10 @@ define i8 @scmp_8_16(i16 signext %x, i16 signext %y) nounwind { ; CHECK-LABEL: scmp_8_16: ; CHECK: # %bb.0: ; CHECK-NEXT: cmpw 3, 4 -; CHECK-NEXT: sub 5, 4, 3 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: rldicl 5, 5, 1, 63 -; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: sub 3, 4, 3 +; CHECK-NEXT: li 4, -1 +; CHECK-NEXT: rldicl 3, 3, 1, 63 +; CHECK-NEXT: isellt 3, 4, 3 ; CHECK-NEXT: blr %1 = call i8 @llvm.scmp(i16 %x, i16 %y) ret i8 %1 diff --git a/llvm/test/CodeGen/PowerPC/ucmp.ll b/llvm/test/CodeGen/PowerPC/ucmp.ll index d2dff6e7e05c8..4d393dd00e3db 100644 --- a/llvm/test/CodeGen/PowerPC/ucmp.ll +++ b/llvm/test/CodeGen/PowerPC/ucmp.ll @@ -4,12 +4,10 @@ define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind { ; CHECK-LABEL: ucmp_8_8: ; CHECK: # %bb.0: -; CHECK-NEXT: cmplw 3, 4 -; CHECK-NEXT: sub 5, 4, 3 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: rldicl 5, 5, 1, 63 -; CHECK-NEXT: rldic 3, 3, 0, 32 -; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: blr %1 = call i8 @llvm.ucmp(i8 %x, i8 %y) ret i8 %1 @@ -18,12 +16,10 @@ define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind { define i8 @ucmp_8_16(i16 
zeroext %x, i16 zeroext %y) nounwind { ; CHECK-LABEL: ucmp_8_16: ; CHECK: # %bb.0: -; CHECK-NEXT: cmplw 3, 4 -; CHECK-NEXT: sub 5, 4, 3 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: rldicl 5, 5, 1, 63 -; CHECK-NEXT: rldic 3, 3, 0, 32 -; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: blr %1 = call i8 @llvm.ucmp(i16 %x, i16 %y) ret i8 %1 @@ -32,14 +28,10 @@ define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind { define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind { ; CHECK-LABEL: ucmp_8_32: ; CHECK: # %bb.0: -; CHECK-NEXT: clrldi 5, 4, 32 -; CHECK-NEXT: clrldi 6, 3, 32 -; CHECK-NEXT: sub 5, 5, 6 -; CHECK-NEXT: cmplw 3, 4 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: rldic 3, 3, 0, 32 -; CHECK-NEXT: rldicl 5, 5, 1, 63 -; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: blr %1 = call i8 @llvm.ucmp(i32 %x, i32 %y) ret i8 %1 @@ -48,12 +40,10 @@ define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind { define i8 @ucmp_8_64(i64 %x, i64 %y) nounwind { ; CHECK-LABEL: ucmp_8_64: ; CHECK: # %bb.0: -; CHECK-NEXT: cmpld 3, 4 -; CHECK-NEXT: subc 3, 4, 3 -; CHECK-NEXT: subfe 3, 4, 4 -; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: neg 3, 3 -; CHECK-NEXT: isellt 3, 4, 3 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: blr %1 = call i8 @llvm.ucmp(i64 %x, i64 %y) ret i8 %1 @@ -82,14 +72,10 @@ define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind { define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind { ; CHECK-LABEL: ucmp_32_32: ; CHECK: # %bb.0: -; CHECK-NEXT: clrldi 5, 4, 32 -; CHECK-NEXT: clrldi 6, 3, 32 -; CHECK-NEXT: sub 5, 5, 6 -; CHECK-NEXT: cmplw 3, 4 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: rldic 3, 3, 0, 32 -; CHECK-NEXT: rldicl 5, 5, 1, 63 -; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: blr %1 = call i32 @llvm.ucmp(i32 %x, i32 %y) ret i32 %1 @@ -98,12 +84,10 @@ define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind { define i32 @ucmp_32_64(i64 %x, i64 %y) nounwind { ; CHECK-LABEL: ucmp_32_64: ; CHECK: # %bb.0: -; CHECK-NEXT: cmpld 3, 4 -; CHECK-NEXT: subc 3, 4, 3 -; CHECK-NEXT: subfe 3, 4, 4 -; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: neg 3, 3 -; CHECK-NEXT: isellt 3, 4, 3 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: blr %1 = call i32 @llvm.ucmp(i64 %x, i64 %y) ret i32 %1 @@ -112,12 +96,10 @@ define i32 @ucmp_32_64(i64 %x, i64 %y) nounwind { define i64 @ucmp_64_64(i64 %x, i64 %y) nounwind { ; CHECK-LABEL: ucmp_64_64: ; CHECK: # %bb.0: -; CHECK-NEXT: subc 5, 4, 3 -; CHECK-NEXT: cmpld 3, 4 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: subfe 5, 4, 4 -; CHECK-NEXT: neg 5, 5 -; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: blr %1 = call i64 @llvm.ucmp(i64 %x, i64 %y) ret i64 %1 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-threeway-cmp-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-threeway-cmp-rv32.mir index 4ffca796a4c20..3076de4376cef 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-threeway-cmp-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-threeway-cmp-rv32.mir @@ -12,8 +12,10 @@ body: | ; CHECK-NEXT: {{ 
$}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY]](s32), [[COPY1]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY]](s32), [[COPY1]] + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s32) = G_FREEZE [[COPY]] + ; CHECK-NEXT: [[FREEZE1:%[0-9]+]]:_(s32) = G_FREEZE [[COPY1]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[FREEZE]](s32), [[FREEZE1]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[FREEZE]](s32), [[FREEZE1]] ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[ICMP]], [[ICMP1]] ; CHECK-NEXT: $x10 = COPY [[SUB]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -36,8 +38,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY]](s32), [[COPY1]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY]](s32), [[COPY1]] + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s32) = G_FREEZE [[COPY]] + ; CHECK-NEXT: [[FREEZE1:%[0-9]+]]:_(s32) = G_FREEZE [[COPY1]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[FREEZE]](s32), [[FREEZE1]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[FREEZE]](s32), [[FREEZE1]] ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[ICMP]], [[ICMP1]] ; CHECK-NEXT: $x10 = COPY [[SUB]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-threeway-cmp-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-threeway-cmp-rv64.mir index 9e60a767c55fe..faf1e346656b3 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-threeway-cmp-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-threeway-cmp-rv64.mir @@ -11,14 +11,18 @@ body: | ; CHECK: liveins: $x10, $x11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 32 - ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(sgt), [[SEXT_INREG]](s64), [[SEXT_INREG1]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[SEXT_INREG]](s64), [[SEXT_INREG1]] + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s32) = G_FREEZE [[TRUNC]] + ; CHECK-NEXT: [[FREEZE1:%[0-9]+]]:_(s32) = G_FREEZE [[TRUNC1]] + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[FREEZE]](s32) + ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(s64) = G_SEXT [[FREEZE1]](s32) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(sgt), [[SEXT]](s64), [[SEXT1]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[SEXT]](s64), [[SEXT1]] ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[ICMP]], [[ICMP1]] - ; CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32 - ; CHECK-NEXT: $x10 = COPY [[SEXT_INREG2]](s64) + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32 + ; CHECK-NEXT: $x10 = COPY [[SEXT_INREG]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %2:_(s64) = COPY $x10 %0:_(s32) = G_TRUNC %2(s64) @@ -40,14 +44,18 @@ body: | ; CHECK: liveins: $x10, $x11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = 
G_TRUNC [[COPY]](s64) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 32 - ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(sgt), [[SEXT_INREG]](s64), [[SEXT_INREG1]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[SEXT_INREG]](s64), [[SEXT_INREG1]] + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s32) = G_FREEZE [[TRUNC]] + ; CHECK-NEXT: [[FREEZE1:%[0-9]+]]:_(s32) = G_FREEZE [[TRUNC1]] + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[FREEZE]](s32) + ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(s64) = G_SEXT [[FREEZE1]](s32) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(sgt), [[SEXT]](s64), [[SEXT1]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[SEXT]](s64), [[SEXT1]] ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[ICMP]], [[ICMP1]] - ; CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32 - ; CHECK-NEXT: $x10 = COPY [[SEXT_INREG2]](s64) + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32 + ; CHECK-NEXT: $x10 = COPY [[SEXT_INREG]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %2:_(s64) = COPY $x10 %0:_(s32) = G_TRUNC %2(s64) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/scmp.ll b/llvm/test/CodeGen/RISCV/GlobalISel/scmp.ll index daeb2e69c83bd..c15604bb1aadc 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/scmp.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/scmp.ll @@ -5,6 +5,10 @@ define i8 @scmp.8.8(i8 signext %x, i8 signext %y) nounwind { ; RV32I-LABEL: scmp.8.8: ; RV32I: # %bb.0: +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: srai a0, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 ; RV32I-NEXT: slt a2, a1, a0 ; RV32I-NEXT: slt a0, a0, a1 ; RV32I-NEXT: sub a0, a2, a0 @@ -12,6 +16,10 @@ define i8 @scmp.8.8(i8 signext %x, i8 signext %y) nounwind { ; ; RV64I-LABEL: scmp.8.8: ; RV64I: # %bb.0: +; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: slli a1, a1, 56 +; RV64I-NEXT: srai a0, a0, 56 +; RV64I-NEXT: srai a1, a1, 56 ; RV64I-NEXT: slt a2, a1, a0 ; RV64I-NEXT: slt a0, a0, a1 ; RV64I-NEXT: sub a0, a2, a0 @@ -23,6 +31,10 @@ define i8 @scmp.8.8(i8 signext %x, i8 signext %y) nounwind { define i8 @scmp.8.16(i16 signext %x, i16 signext %y) nounwind { ; RV32I-LABEL: scmp.8.16: ; RV32I: # %bb.0: +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: srai a0, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 ; RV32I-NEXT: slt a2, a1, a0 ; RV32I-NEXT: slt a0, a0, a1 ; RV32I-NEXT: sub a0, a2, a0 @@ -30,6 +42,10 @@ define i8 @scmp.8.16(i16 signext %x, i16 signext %y) nounwind { ; ; RV64I-LABEL: scmp.8.16: ; RV64I: # %bb.0: +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: slli a1, a1, 48 +; RV64I-NEXT: srai a0, a0, 48 +; RV64I-NEXT: srai a1, a1, 48 ; RV64I-NEXT: slt a2, a1, a0 ; RV64I-NEXT: slt a0, a0, a1 ; RV64I-NEXT: sub a0, a2, a0 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/ucmp.ll b/llvm/test/CodeGen/RISCV/GlobalISel/ucmp.ll index 463883b371caf..527480541c923 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/ucmp.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/ucmp.ll @@ -5,6 +5,8 @@ define i8 @ucmp.8.8(i8 zeroext %x, i8 zeroext %y) nounwind { ; RV32I-LABEL: ucmp.8.8: ; RV32I: # %bb.0: +; RV32I-NEXT: zext.b a0, a0 +; RV32I-NEXT: zext.b a1, a1 ; RV32I-NEXT: sltu a2, a1, a0 ; RV32I-NEXT: sltu a0, a0, a1 ; RV32I-NEXT: sub a0, a2, a0 @@ -12,6 +14,8 @@ define i8 @ucmp.8.8(i8 zeroext %x, i8 zeroext %y) 
nounwind { ; ; RV64I-LABEL: ucmp.8.8: ; RV64I: # %bb.0: +; RV64I-NEXT: zext.b a0, a0 +; RV64I-NEXT: zext.b a1, a1 ; RV64I-NEXT: sltu a2, a1, a0 ; RV64I-NEXT: sltu a0, a0, a1 ; RV64I-NEXT: sub a0, a2, a0 @@ -23,6 +27,10 @@ define i8 @ucmp.8.8(i8 zeroext %x, i8 zeroext %y) nounwind { define i8 @ucmp.8.16(i16 zeroext %x, i16 zeroext %y) nounwind { ; RV32I-LABEL: ucmp.8.16: ; RV32I: # %bb.0: +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: srli a1, a1, 16 ; RV32I-NEXT: sltu a2, a1, a0 ; RV32I-NEXT: sltu a0, a0, a1 ; RV32I-NEXT: sub a0, a2, a0 @@ -30,6 +38,10 @@ define i8 @ucmp.8.16(i16 zeroext %x, i16 zeroext %y) nounwind { ; ; RV64I-LABEL: ucmp.8.16: ; RV64I: # %bb.0: +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: slli a1, a1, 48 +; RV64I-NEXT: srli a0, a0, 48 +; RV64I-NEXT: srli a1, a1, 48 ; RV64I-NEXT: sltu a2, a1, a0 ; RV64I-NEXT: sltu a0, a0, a1 ; RV64I-NEXT: sub a0, a2, a0 @@ -113,6 +125,8 @@ define i32 @ucmp.32.32_sext(i32 signext %x, i32 signext %y) nounwind { ; ; RV64I-LABEL: ucmp.32.32_sext: ; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: sext.w a1, a1 ; RV64I-NEXT: sltu a2, a1, a0 ; RV64I-NEXT: sltu a0, a0, a1 ; RV64I-NEXT: subw a0, a2, a0 diff --git a/llvm/test/CodeGen/SystemZ/ucmp.ll b/llvm/test/CodeGen/SystemZ/ucmp.ll index 4175cd7850a98..786f5610c2d1f 100644 --- a/llvm/test/CodeGen/SystemZ/ucmp.ll +++ b/llvm/test/CodeGen/SystemZ/ucmp.ll @@ -4,7 +4,7 @@ define i8 @ucmp.8.8(i8 zeroext %x, i8 zeroext %y) nounwind { ; CHECK-LABEL: ucmp.8.8: ; CHECK: # %bb.0: -; CHECK-NEXT: cr %r2, %r3 +; CHECK-NEXT: clr %r2, %r3 ; CHECK-NEXT: lhi %r2, 0 ; CHECK-NEXT: lochih %r2, 1 ; CHECK-NEXT: lochil %r2, -1 @@ -16,7 +16,7 @@ define i8 @ucmp.8.8(i8 zeroext %x, i8 zeroext %y) nounwind { define i8 @ucmp.8.16(i16 zeroext %x, i16 zeroext %y) nounwind { ; CHECK-LABEL: ucmp.8.16: ; CHECK: # %bb.0: -; CHECK-NEXT: cr %r2, %r3 +; CHECK-NEXT: clr %r2, %r3 ; CHECK-NEXT: lhi %r2, 0 ; CHECK-NEXT: lochih %r2, 1 ; CHECK-NEXT: lochil %r2, -1 diff --git a/llvm/test/CodeGen/Thumb/scmp.ll b/llvm/test/CodeGen/Thumb/scmp.ll index c0024492b3a6d..cf73771d3c426 100644 --- a/llvm/test/CodeGen/Thumb/scmp.ll +++ b/llvm/test/CodeGen/Thumb/scmp.ll @@ -184,44 +184,50 @@ define i8 @scmp_8_128(i128 %x, i128 %y) nounwind { ; THUMB1: @ %bb.0: ; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} ; THUMB1-NEXT: push {r4, r5, r6, r7, lr} -; THUMB1-NEXT: .pad #20 -; THUMB1-NEXT: sub sp, #20 -; THUMB1-NEXT: str r3, [sp, #16] @ 4-byte Spill -; THUMB1-NEXT: movs r3, #1 -; THUMB1-NEXT: str r3, [sp] @ 4-byte Spill -; THUMB1-NEXT: movs r3, #0 -; THUMB1-NEXT: str r3, [sp, #12] @ 4-byte Spill -; THUMB1-NEXT: ldr r6, [sp, #52] -; THUMB1-NEXT: add r7, sp, #40 -; THUMB1-NEXT: ldm r7, {r3, r5, r7} -; THUMB1-NEXT: subs r4, r0, r3 -; THUMB1-NEXT: str r1, [sp, #4] @ 4-byte Spill +; THUMB1-NEXT: .pad #36 +; THUMB1-NEXT: sub sp, #36 +; THUMB1-NEXT: ldr r4, [sp, #68] +; THUMB1-NEXT: str r4, [sp, #8] @ 4-byte Spill +; THUMB1-NEXT: add r7, sp, #56 +; THUMB1-NEXT: ldm r7, {r5, r6, r7} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: str r4, [sp, #4] @ 4-byte Spill +; THUMB1-NEXT: movs r4, #0 +; THUMB1-NEXT: str r4, [sp, #24] @ 4-byte Spill +; THUMB1-NEXT: str r0, [sp, #16] @ 4-byte Spill +; THUMB1-NEXT: str r5, [sp, #12] @ 4-byte Spill +; THUMB1-NEXT: subs r4, r0, r5 +; THUMB1-NEXT: ldr r5, [sp, #8] @ 4-byte Reload +; THUMB1-NEXT: str r1, [sp, #20] @ 4-byte Spill ; THUMB1-NEXT: mov r4, r1 -; THUMB1-NEXT: ldr r1, [sp] @ 4-byte Reload -; THUMB1-NEXT: sbcs r4, r5 -; THUMB1-NEXT: str r2, [sp, #8] @ 4-byte 
Spill +; THUMB1-NEXT: sbcs r4, r6 +; THUMB1-NEXT: str r2, [sp, #28] @ 4-byte Spill ; THUMB1-NEXT: mov r4, r2 +; THUMB1-NEXT: ldr r2, [sp, #4] @ 4-byte Reload ; THUMB1-NEXT: sbcs r4, r7 -; THUMB1-NEXT: ldr r4, [sp, #16] @ 4-byte Reload -; THUMB1-NEXT: sbcs r4, r6 -; THUMB1-NEXT: mov r2, r1 +; THUMB1-NEXT: str r3, [sp, #32] @ 4-byte Spill +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: sbcs r4, r5 +; THUMB1-NEXT: mov r1, r2 ; THUMB1-NEXT: blt .LBB4_2 ; THUMB1-NEXT: @ %bb.1: -; THUMB1-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; THUMB1-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; THUMB1-NEXT: .LBB4_2: -; THUMB1-NEXT: subs r0, r3, r0 -; THUMB1-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; THUMB1-NEXT: sbcs r5, r0 -; THUMB1-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; THUMB1-NEXT: sbcs r7, r0 ; THUMB1-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; THUMB1-NEXT: ldr r3, [sp, #12] @ 4-byte Reload +; THUMB1-NEXT: subs r0, r3, r0 +; THUMB1-NEXT: ldr r0, [sp, #20] @ 4-byte Reload ; THUMB1-NEXT: sbcs r6, r0 +; THUMB1-NEXT: ldr r0, [sp, #28] @ 4-byte Reload +; THUMB1-NEXT: sbcs r7, r0 +; THUMB1-NEXT: ldr r0, [sp, #32] @ 4-byte Reload +; THUMB1-NEXT: sbcs r5, r0 ; THUMB1-NEXT: blt .LBB4_4 ; THUMB1-NEXT: @ %bb.3: -; THUMB1-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; THUMB1-NEXT: ldr r2, [sp, #24] @ 4-byte Reload ; THUMB1-NEXT: .LBB4_4: -; THUMB1-NEXT: subs r0, r1, r2 -; THUMB1-NEXT: add sp, #20 +; THUMB1-NEXT: subs r0, r2, r1 +; THUMB1-NEXT: add sp, #36 ; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} ; ; THUMB2-LABEL: scmp_8_128: diff --git a/llvm/test/CodeGen/Thumb/ucmp.ll b/llvm/test/CodeGen/Thumb/ucmp.ll index 5d0f57e2a9d72..e10a162ed0474 100644 --- a/llvm/test/CodeGen/Thumb/ucmp.ll +++ b/llvm/test/CodeGen/Thumb/ucmp.ll @@ -151,44 +151,50 @@ define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind { ; THUMB1: @ %bb.0: ; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} ; THUMB1-NEXT: push {r4, r5, r6, r7, lr} -; THUMB1-NEXT: .pad #20 -; THUMB1-NEXT: sub sp, #20 -; THUMB1-NEXT: str r3, [sp, #16] @ 4-byte Spill -; THUMB1-NEXT: movs r3, #1 -; THUMB1-NEXT: str r3, [sp] @ 4-byte Spill -; THUMB1-NEXT: movs r3, #0 -; THUMB1-NEXT: str r3, [sp, #12] @ 4-byte Spill -; THUMB1-NEXT: ldr r6, [sp, #52] -; THUMB1-NEXT: add r7, sp, #40 -; THUMB1-NEXT: ldm r7, {r3, r5, r7} -; THUMB1-NEXT: subs r4, r0, r3 -; THUMB1-NEXT: str r1, [sp, #4] @ 4-byte Spill +; THUMB1-NEXT: .pad #36 +; THUMB1-NEXT: sub sp, #36 +; THUMB1-NEXT: ldr r4, [sp, #68] +; THUMB1-NEXT: str r4, [sp, #8] @ 4-byte Spill +; THUMB1-NEXT: add r7, sp, #56 +; THUMB1-NEXT: ldm r7, {r5, r6, r7} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: str r4, [sp, #4] @ 4-byte Spill +; THUMB1-NEXT: movs r4, #0 +; THUMB1-NEXT: str r4, [sp, #24] @ 4-byte Spill +; THUMB1-NEXT: str r0, [sp, #16] @ 4-byte Spill +; THUMB1-NEXT: str r5, [sp, #12] @ 4-byte Spill +; THUMB1-NEXT: subs r4, r0, r5 +; THUMB1-NEXT: ldr r5, [sp, #8] @ 4-byte Reload +; THUMB1-NEXT: str r1, [sp, #20] @ 4-byte Spill ; THUMB1-NEXT: mov r4, r1 -; THUMB1-NEXT: ldr r1, [sp] @ 4-byte Reload -; THUMB1-NEXT: sbcs r4, r5 -; THUMB1-NEXT: str r2, [sp, #8] @ 4-byte Spill +; THUMB1-NEXT: sbcs r4, r6 +; THUMB1-NEXT: str r2, [sp, #28] @ 4-byte Spill ; THUMB1-NEXT: mov r4, r2 +; THUMB1-NEXT: ldr r2, [sp, #4] @ 4-byte Reload ; THUMB1-NEXT: sbcs r4, r7 -; THUMB1-NEXT: ldr r4, [sp, #16] @ 4-byte Reload -; THUMB1-NEXT: sbcs r4, r6 -; THUMB1-NEXT: mov r2, r1 +; THUMB1-NEXT: str r3, [sp, #32] @ 4-byte Spill +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: sbcs r4, r5 +; THUMB1-NEXT: mov r1, r2 ; THUMB1-NEXT: blo .LBB4_2 ; THUMB1-NEXT: @ %bb.1: -; THUMB1-NEXT: ldr r2, [sp, #12] @ 
4-byte Reload +; THUMB1-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; THUMB1-NEXT: .LBB4_2: -; THUMB1-NEXT: subs r0, r3, r0 -; THUMB1-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; THUMB1-NEXT: sbcs r5, r0 -; THUMB1-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; THUMB1-NEXT: sbcs r7, r0 ; THUMB1-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; THUMB1-NEXT: ldr r3, [sp, #12] @ 4-byte Reload +; THUMB1-NEXT: subs r0, r3, r0 +; THUMB1-NEXT: ldr r0, [sp, #20] @ 4-byte Reload ; THUMB1-NEXT: sbcs r6, r0 +; THUMB1-NEXT: ldr r0, [sp, #28] @ 4-byte Reload +; THUMB1-NEXT: sbcs r7, r0 +; THUMB1-NEXT: ldr r0, [sp, #32] @ 4-byte Reload +; THUMB1-NEXT: sbcs r5, r0 ; THUMB1-NEXT: blo .LBB4_4 ; THUMB1-NEXT: @ %bb.3: -; THUMB1-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; THUMB1-NEXT: ldr r2, [sp, #24] @ 4-byte Reload ; THUMB1-NEXT: .LBB4_4: -; THUMB1-NEXT: subs r0, r1, r2 -; THUMB1-NEXT: add sp, #20 +; THUMB1-NEXT: subs r0, r2, r1 +; THUMB1-NEXT: add sp, #36 ; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} ; ; THUMB2-LABEL: ucmp_8_128: diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll index 8a287229a1cb1..5a7a05d09763e 100644 --- a/llvm/test/CodeGen/X86/scmp.ll +++ b/llvm/test/CodeGen/X86/scmp.ll @@ -17,7 +17,7 @@ define i8 @scmp.8.8(i8 %x, i8 %y) nounwind { ; X86-LABEL: scmp.8.8: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp) ; X86-NEXT: setl %cl ; X86-NEXT: setg %al ; X86-NEXT: subb %cl, %al @@ -38,7 +38,7 @@ define i8 @scmp.8.16(i16 %x, i16 %y) nounwind { ; X86-LABEL: scmp.8.16: ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw {{[0-9]+}}(%esp), %ax +; X86-NEXT: cmpw %ax, {{[0-9]+}}(%esp) ; X86-NEXT: setl %cl ; X86-NEXT: setg %al ; X86-NEXT: subb %cl, %al @@ -59,7 +59,7 @@ define i8 @scmp.8.32(i32 %x, i32 %y) nounwind { ; X86-LABEL: scmp.8.32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: setl %cl ; X86-NEXT: setg %al ; X86-NEXT: subb %cl, %al @@ -167,7 +167,7 @@ define i32 @scmp.32.32(i32 %x, i32 %y) nounwind { ; X86-LABEL: scmp.32.32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al ; X86-NEXT: setg %cl ; X86-NEXT: subb %al, %cl @@ -263,7 +263,7 @@ define i4 @scmp_narrow_result(i32 %x, i32 %y) nounwind { ; X86-LABEL: scmp_narrow_result: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: setl %cl ; X86-NEXT: setg %al ; X86-NEXT: subb %cl, %al @@ -330,9 +330,9 @@ define i141 @scmp_wide_result(i32 %x, i32 %y) nounwind { ; ; X86-LABEL: scmp_wide_result: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) ; X86-NEXT: setl %cl ; X86-NEXT: setg %dl ; X86-NEXT: subb %cl, %dl @@ -471,27 +471,27 @@ define <4 x i32> @scmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: 
cmpl %edx, {{[0-9]+}}(%esp) ; X86-NEXT: setl %dl ; X86-NEXT: setg %dh ; X86-NEXT: subb %dl, %dh ; X86-NEXT: movsbl %dh, %edx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: setl %bl ; X86-NEXT: setg %bh ; X86-NEXT: subb %bl, %bh ; X86-NEXT: movsbl %bh, %edi -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %esi, {{[0-9]+}}(%esp) ; X86-NEXT: setl %bl ; X86-NEXT: setg %bh ; X86-NEXT: subb %bl, %bh ; X86-NEXT: movsbl %bh, %esi -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) ; X86-NEXT: setl %cl ; X86-NEXT: setg %ch ; X86-NEXT: subb %cl, %ch @@ -628,31 +628,31 @@ define <4 x i8> @scmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: setl %ch -; X86-NEXT: setg %cl -; X86-NEXT: subb %ch, %cl -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi -; X86-NEXT: setl %ch +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: setl %dh +; X86-NEXT: setg %dl +; X86-NEXT: subb %dh, %dl +; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: setl %dh ; X86-NEXT: setg %bl -; X86-NEXT: subb %ch, %bl -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi -; X86-NEXT: setl %ch +; X86-NEXT: subb %dh, %bl +; X86-NEXT: cmpl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: setl %dh ; X86-NEXT: setg %bh -; X86-NEXT: subb %ch, %bh -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx -; X86-NEXT: setl %dl +; X86-NEXT: subb %dh, %bh +; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: setl %cl ; X86-NEXT: setg %ch -; X86-NEXT: subb %dl, %ch +; X86-NEXT: subb %cl, %ch ; X86-NEXT: movb %ch, 3(%eax) ; X86-NEXT: movb %bh, 2(%eax) ; X86-NEXT: movb %bl, 1(%eax) -; X86-NEXT: movb %cl, (%eax) +; X86-NEXT: movb %dl, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -713,27 +713,27 @@ define <4 x i32> @scmp_narrow_vec_op(<4 x i8> %x, <4 x i8> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %dl, {{[0-9]+}}(%esp) ; X86-NEXT: setl %dl ; X86-NEXT: setg %dh ; X86-NEXT: subb %dl, %dh ; X86-NEXT: movsbl %dh, %edx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bl +; X86-NEXT: cmpb %bl, {{[0-9]+}}(%esp) ; X86-NEXT: setl %bl ; X86-NEXT: setg %bh ; X86-NEXT: subb %bl, %bh ; X86-NEXT: movsbl %bh, %esi -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ch +; X86-NEXT: cmpb %ch, {{[0-9]+}}(%esp) ; X86-NEXT: setl %ch ; X86-NEXT: setg %bl ; X86-NEXT: subb %ch, %bl ; X86-NEXT: movsbl %bl, %edi -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl +; X86-NEXT: cmpb %cl, {{[0-9]+}}(%esp) ; X86-NEXT: setl %cl ; X86-NEXT: setg %ch ; X86-NEXT: subb %cl, %ch @@ -869,90 +869,90 @@ define <16 x i32> @scmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movb {{[0-9]+}}(%esp), %bh ; X86-NEXT: movb {{[0-9]+}}(%esp), %al -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, 
{{[0-9]+}}(%esp) ; X86-NEXT: setl %al ; X86-NEXT: setg %cl ; X86-NEXT: subb %al, %cl ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bh +; X86-NEXT: cmpb %bh, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al ; X86-NEXT: setg %cl ; X86-NEXT: subb %al, %cl ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bl +; X86-NEXT: cmpb %bl, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al ; X86-NEXT: setg %cl ; X86-NEXT: subb %al, %cl ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dh +; X86-NEXT: cmpb %dh, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al ; X86-NEXT: setg %cl ; X86-NEXT: subb %al, %cl ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ch +; X86-NEXT: cmpb %ch, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al ; X86-NEXT: setg %cl ; X86-NEXT: subb %al, %cl ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ah +; X86-NEXT: cmpb %ah, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al ; X86-NEXT: setg %cl ; X86-NEXT: subb %al, %cl ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dl +; X86-NEXT: cmpb %dl, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al ; X86-NEXT: setg %cl ; X86-NEXT: subb %al, %cl ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al ; X86-NEXT: setg %bh ; X86-NEXT: subb %al, %bh ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al ; X86-NEXT: setg %bl ; X86-NEXT: subb %al, %bl ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al ; X86-NEXT: setg %dh ; X86-NEXT: subb %al, %dh ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al ; X86-NEXT: setg %dl ; X86-NEXT: subb %al, %dl ; X86-NEXT: movsbl %dl, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al ; X86-NEXT: setg %dl ; X86-NEXT: subb %al, %dl ; X86-NEXT: movsbl %dl, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al ; X86-NEXT: setg %dl ; X86-NEXT: subb %al, %dl ; X86-NEXT: movsbl %dl, %ebp ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al ; X86-NEXT: setg %dl ; X86-NEXT: subb %al, %dl ; X86-NEXT: movsbl %dl, %edi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al ; X86-NEXT: setg %ah ; X86-NEXT: subb %al, %ah ; X86-NEXT: movsbl %ah, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al ; X86-NEXT: setg %dl ; X86-NEXT: subb %al, %dl @@ -999,154 +999,179 @@ define <16 x i32> @scmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind { define <16 x i8> @scmp_wide_vec_op(<16 x i64> %x, <16 x i64> 
%y) nounwind { ; SSE2-LABEL: scmp_wide_vec_op: ; SSE2: # %bb.0: -; SSE2-NEXT: movq %xmm7, %rax +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,3,2,3] +; SSE2-NEXT: movq %xmm8, %rax ; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 ; SSE2-NEXT: setl %al ; SSE2-NEXT: setg %cl ; SSE2-NEXT: subb %al, %cl ; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] -; SSE2-NEXT: movq %xmm7, %rax -; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movq %xmm7, %rcx +; SSE2-NEXT: movd %eax, %xmm7 +; SSE2-NEXT: movq %xmm8, %rax +; SSE2-NEXT: cmpq %rax, %rcx ; SSE2-NEXT: setl %al ; SSE2-NEXT: setg %cl ; SSE2-NEXT: subb %al, %cl ; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm7 -; SSE2-NEXT: movq %xmm6, %rax -; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movd %eax, %xmm8 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] +; SSE2-NEXT: movq %xmm7, %rax +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 +; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: setl %al ; SSE2-NEXT: setg %cl ; SSE2-NEXT: subb %al, %cl ; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] -; SSE2-NEXT: movq %xmm6, %rax -; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movq %xmm6, %rcx +; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: movq %xmm7, %rax +; SSE2-NEXT: cmpq %rax, %rcx ; SSE2-NEXT: setl %al ; SSE2-NEXT: setg %cl ; SSE2-NEXT: subb %al, %cl ; SSE2-NEXT: movzbl %cl, %eax ; SSE2-NEXT: movd %eax, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; SSE2-NEXT: movq %xmm5, %rax +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] +; SSE2-NEXT: movq %xmm7, %rax +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: setl %al ; SSE2-NEXT: setg %cl ; SSE2-NEXT: subb %al, %cl ; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] ; SSE2-NEXT: movq %xmm5, %rcx -; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: movd %eax, %xmm6 +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: movq %xmm7, %rax +; SSE2-NEXT: cmpq %rax, %rcx ; SSE2-NEXT: setl %al ; SSE2-NEXT: setg %cl ; SSE2-NEXT: subb %al, %cl ; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movq %xmm4, %rcx -; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: movd %eax, %xmm8 +; SSE2-NEXT: movd %eax, %xmm7 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; SSE2-NEXT: movq %xmm5, %rax +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 +; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: setl %al ; SSE2-NEXT: setg %cl ; SSE2-NEXT: subb %al, %cl ; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 
= xmm4[2,3,2,3] -; SSE2-NEXT: movq %xmm4, %rax -; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movq %xmm4, %rcx +; SSE2-NEXT: movd %eax, %xmm8 +; SSE2-NEXT: movq %xmm5, %rax +; SSE2-NEXT: cmpq %rax, %rcx ; SSE2-NEXT: setl %al ; SSE2-NEXT: setg %cl ; SSE2-NEXT: subb %al, %cl ; SSE2-NEXT: movzbl %cl, %eax ; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] +; SSE2-NEXT: movq %xmm5, %rax +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 ; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: setl %al ; SSE2-NEXT: setg %cl ; SSE2-NEXT: subb %al, %cl ; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; SSE2-NEXT: movq %xmm3, %rcx ; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx +; SSE2-NEXT: movq %xmm5, %rax +; SSE2-NEXT: cmpq %rax, %rcx ; SSE2-NEXT: setl %al ; SSE2-NEXT: setg %cl ; SSE2-NEXT: subb %al, %cl ; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movq %xmm2, %rcx -; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; SSE2-NEXT: movq %xmm2, %rcx -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: setl %al -; SSE2-NEXT: setg %dl -; SSE2-NEXT: subb %al, %dl -; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: movzbl %dl, %eax -; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 +; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: setl %al ; SSE2-NEXT: setg %cl ; SSE2-NEXT: subb %al, %cl ; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movq %xmm2, %rcx +; SSE2-NEXT: movd %eax, %xmm6 +; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: cmpq %rax, %rcx ; SSE2-NEXT: setl %al ; SSE2-NEXT: setg %cl ; SSE2-NEXT: subb %al, %cl ; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: movd %eax, 
%xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 ; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: setl %al ; SSE2-NEXT: setg %cl ; SSE2-NEXT: subb %al, %cl ; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movq %xmm1, %rcx ; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: cmpq %rax, %rcx ; SSE2-NEXT: setl %al ; SSE2-NEXT: setg %cl ; SSE2-NEXT: subb %al, %cl ; SSE2-NEXT: movzbl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 ; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: setl %al ; SSE2-NEXT: setg %cl ; SSE2-NEXT: subb %al, %cl ; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: cmpq %rax, %rcx +; SSE2-NEXT: setl %al +; SSE2-NEXT: setg %cl +; SSE2-NEXT: subb %al, %cl +; SSE2-NEXT: movzbl %cl, %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; SSE2-NEXT: retq ; ; SSE4-LABEL: scmp_wide_vec_op: ; SSE4: # %bb.0: +; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 +; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 +; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 +; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 +; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 +; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15 ; SSE4-NEXT: pextrq $1, %xmm0, %rax -; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: pextrq $1, %xmm15, %rcx +; SSE4-NEXT: cmpq %rcx, %rax ; SSE4-NEXT: setl %al ; SSE4-NEXT: setg %cl ; SSE4-NEXT: subb %al, %cl ; SSE4-NEXT: movzbl %cl, %eax ; SSE4-NEXT: movq %xmm0, %rcx -; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx +; SSE4-NEXT: movq %xmm15, %rdx +; SSE4-NEXT: cmpq %rdx, %rcx ; SSE4-NEXT: setl %cl ; SSE4-NEXT: setg %dl ; 
SSE4-NEXT: subb %cl, %dl @@ -1154,98 +1179,112 @@ define <16 x i8> @scmp_wide_vec_op(<16 x i64> %x, <16 x i64> %y) nounwind { ; SSE4-NEXT: movd %ecx, %xmm0 ; SSE4-NEXT: pinsrb $1, %eax, %xmm0 ; SSE4-NEXT: movq %xmm1, %rax -; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: movq %xmm14, %rcx +; SSE4-NEXT: cmpq %rcx, %rax ; SSE4-NEXT: setl %al ; SSE4-NEXT: setg %cl ; SSE4-NEXT: subb %al, %cl ; SSE4-NEXT: movzbl %cl, %eax ; SSE4-NEXT: pinsrb $2, %eax, %xmm0 ; SSE4-NEXT: pextrq $1, %xmm1, %rax -; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: pextrq $1, %xmm14, %rcx +; SSE4-NEXT: cmpq %rcx, %rax ; SSE4-NEXT: setl %al ; SSE4-NEXT: setg %cl ; SSE4-NEXT: subb %al, %cl ; SSE4-NEXT: movzbl %cl, %eax ; SSE4-NEXT: pinsrb $3, %eax, %xmm0 ; SSE4-NEXT: movq %xmm2, %rax -; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: movq %xmm13, %rcx +; SSE4-NEXT: cmpq %rcx, %rax ; SSE4-NEXT: setl %al ; SSE4-NEXT: setg %cl ; SSE4-NEXT: subb %al, %cl ; SSE4-NEXT: movzbl %cl, %eax ; SSE4-NEXT: pinsrb $4, %eax, %xmm0 ; SSE4-NEXT: pextrq $1, %xmm2, %rax -; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: pextrq $1, %xmm13, %rcx +; SSE4-NEXT: cmpq %rcx, %rax ; SSE4-NEXT: setl %al ; SSE4-NEXT: setg %cl ; SSE4-NEXT: subb %al, %cl ; SSE4-NEXT: movzbl %cl, %eax ; SSE4-NEXT: pinsrb $5, %eax, %xmm0 ; SSE4-NEXT: movq %xmm3, %rax -; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: movq %xmm12, %rcx +; SSE4-NEXT: cmpq %rcx, %rax ; SSE4-NEXT: setl %al ; SSE4-NEXT: setg %cl ; SSE4-NEXT: subb %al, %cl ; SSE4-NEXT: movzbl %cl, %eax ; SSE4-NEXT: pinsrb $6, %eax, %xmm0 ; SSE4-NEXT: pextrq $1, %xmm3, %rax -; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: pextrq $1, %xmm12, %rcx +; SSE4-NEXT: cmpq %rcx, %rax ; SSE4-NEXT: setl %al ; SSE4-NEXT: setg %cl ; SSE4-NEXT: subb %al, %cl ; SSE4-NEXT: movzbl %cl, %eax ; SSE4-NEXT: pinsrb $7, %eax, %xmm0 ; SSE4-NEXT: movq %xmm4, %rax -; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: movq %xmm11, %rcx +; SSE4-NEXT: cmpq %rcx, %rax ; SSE4-NEXT: setl %al ; SSE4-NEXT: setg %cl ; SSE4-NEXT: subb %al, %cl ; SSE4-NEXT: movzbl %cl, %eax ; SSE4-NEXT: pinsrb $8, %eax, %xmm0 ; SSE4-NEXT: pextrq $1, %xmm4, %rax -; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: pextrq $1, %xmm11, %rcx +; SSE4-NEXT: cmpq %rcx, %rax ; SSE4-NEXT: setl %al ; SSE4-NEXT: setg %cl ; SSE4-NEXT: subb %al, %cl ; SSE4-NEXT: movzbl %cl, %eax ; SSE4-NEXT: pinsrb $9, %eax, %xmm0 ; SSE4-NEXT: movq %xmm5, %rax -; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: movq %xmm10, %rcx +; SSE4-NEXT: cmpq %rcx, %rax ; SSE4-NEXT: setl %al ; SSE4-NEXT: setg %cl ; SSE4-NEXT: subb %al, %cl ; SSE4-NEXT: movzbl %cl, %eax ; SSE4-NEXT: pinsrb $10, %eax, %xmm0 ; SSE4-NEXT: pextrq $1, %xmm5, %rax -; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: pextrq $1, %xmm10, %rcx +; SSE4-NEXT: cmpq %rcx, %rax ; SSE4-NEXT: setl %al ; SSE4-NEXT: setg %cl ; SSE4-NEXT: subb %al, %cl ; SSE4-NEXT: movzbl %cl, %eax ; SSE4-NEXT: pinsrb $11, %eax, %xmm0 ; SSE4-NEXT: movq %xmm6, %rax -; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: movq %xmm9, %rcx +; SSE4-NEXT: cmpq %rcx, %rax ; SSE4-NEXT: setl %al ; SSE4-NEXT: setg %cl ; SSE4-NEXT: subb %al, %cl ; SSE4-NEXT: movzbl %cl, %eax ; SSE4-NEXT: pinsrb $12, %eax, %xmm0 ; SSE4-NEXT: pextrq $1, %xmm6, %rax -; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: pextrq $1, %xmm9, %rcx +; SSE4-NEXT: cmpq %rcx, %rax ; SSE4-NEXT: setl %al ; SSE4-NEXT: setg %cl ; SSE4-NEXT: subb %al, %cl ; SSE4-NEXT: movzbl %cl, %eax ; SSE4-NEXT: pinsrb $13, %eax, %xmm0 ; SSE4-NEXT: movq %xmm7, %rax -; 
SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: movq %xmm8, %rcx +; SSE4-NEXT: cmpq %rcx, %rax ; SSE4-NEXT: setl %al ; SSE4-NEXT: setg %cl ; SSE4-NEXT: subb %al, %cl ; SSE4-NEXT: movzbl %cl, %eax ; SSE4-NEXT: pinsrb $14, %eax, %xmm0 ; SSE4-NEXT: pextrq $1, %xmm7, %rax -; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: pextrq $1, %xmm8, %rcx +; SSE4-NEXT: cmpq %rcx, %rax ; SSE4-NEXT: setl %al ; SSE4-NEXT: setg %cl ; SSE4-NEXT: subb %al, %cl @@ -1767,58 +1806,71 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx ; SSE2-NEXT: movq %rdi, %rax -; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; SSE2-NEXT: addb %dil, %dil +; SSE2-NEXT: sarb %dil +; SSE2-NEXT: addb %sil, %sil +; SSE2-NEXT: sarb %sil +; SSE2-NEXT: cmpb %dil, %sil +; SSE2-NEXT: setl %sil +; SSE2-NEXT: setg %dil +; SSE2-NEXT: subb %sil, %dil +; SSE2-NEXT: movsbq %dil, %rdi +; SSE2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movq %rdi, (%rax) +; SSE2-NEXT: sarq $63, %rdi +; SSE2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: addb %bl, %bl +; SSE2-NEXT: sarb %bl +; SSE2-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: addb %sil, %sil +; SSE2-NEXT: sarb %sil +; SSE2-NEXT: cmpb %bl, %sil +; SSE2-NEXT: setl %sil +; SSE2-NEXT: setg %bl +; SSE2-NEXT: subb %sil, %bl +; SSE2-NEXT: movsbq %bl, %rbx +; SSE2-NEXT: movq %rbx, %r14 +; SSE2-NEXT: sarq $63, %r14 ; SSE2-NEXT: addb %r15b, %r15b ; SSE2-NEXT: sarb %r15b +; SSE2-NEXT: movl {{[0-9]+}}(%rsp), %esi ; SSE2-NEXT: addb %sil, %sil ; SSE2-NEXT: sarb %sil ; SSE2-NEXT: cmpb %r15b, %sil ; SSE2-NEXT: setl %sil ; SSE2-NEXT: setg %r15b ; SSE2-NEXT: subb %sil, %r15b -; SSE2-NEXT: movsbq %r15b, %rsi -; SSE2-NEXT: movq %rsi, (%rax) -; SSE2-NEXT: movq %rsi, %xmm0 -; SSE2-NEXT: sarq $63, %rsi -; SSE2-NEXT: addb %r14b, %r14b -; SSE2-NEXT: sarb %r14b -; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d -; SSE2-NEXT: addb %r15b, %r15b -; SSE2-NEXT: sarb %r15b -; SSE2-NEXT: cmpb %r14b, %r15b -; SSE2-NEXT: setl %r14b -; SSE2-NEXT: setg %r15b -; SSE2-NEXT: subb %r14b, %r15b -; SSE2-NEXT: movsbq %r15b, %r14 -; SSE2-NEXT: movq %r14, %r15 -; SSE2-NEXT: sarq $63, %r15 -; SSE2-NEXT: addb %bpl, %bpl -; SSE2-NEXT: sarb %bpl +; SSE2-NEXT: movsbq %r15b, %r15 +; SSE2-NEXT: movq %r15, %r13 +; SSE2-NEXT: sarq $63, %r13 +; SSE2-NEXT: addb %r12b, %r12b +; SSE2-NEXT: sarb %r12b ; SSE2-NEXT: addb %dl, %dl ; SSE2-NEXT: sarb %dl -; SSE2-NEXT: cmpb %bpl, %dl +; SSE2-NEXT: cmpb %r12b, %dl ; SSE2-NEXT: setl %dl -; SSE2-NEXT: setg %bpl -; SSE2-NEXT: subb %dl, %bpl -; SSE2-NEXT: movsbq %bpl, %rdx -; SSE2-NEXT: movq %rdx, %r12 -; SSE2-NEXT: sarq $63, %r12 -; SSE2-NEXT: addb %bl, %bl -; SSE2-NEXT: sarb %bl +; SSE2-NEXT: setg %sil +; SSE2-NEXT: subb %dl, %sil +; SSE2-NEXT: movsbq %sil, %rdx +; SSE2-NEXT: movq %rdx, %rdi +; SSE2-NEXT: sarq $63, %rdi +; SSE2-NEXT: addb %bpl, %bpl +; SSE2-NEXT: sarb %bpl ; SSE2-NEXT: addb %cl, %cl ; SSE2-NEXT: sarb %cl -; SSE2-NEXT: cmpb %bl, %cl +; SSE2-NEXT: cmpb %bpl, %cl ; SSE2-NEXT: setl %cl -; SSE2-NEXT: setg %bl -; SSE2-NEXT: subb %cl, %bl -; SSE2-NEXT: movsbq %bl, %rbx -; SSE2-NEXT: 
movq %rbx, %rcx +; SSE2-NEXT: setg %bpl +; SSE2-NEXT: subb %cl, %bpl +; SSE2-NEXT: movsbq %bpl, %r12 +; SSE2-NEXT: movq %r12, %rcx ; SSE2-NEXT: sarq $63, %rcx ; SSE2-NEXT: addb %r11b, %r11b ; SSE2-NEXT: sarb %r11b @@ -1828,9 +1880,9 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE2-NEXT: setl %r8b ; SSE2-NEXT: setg %r11b ; SSE2-NEXT: subb %r8b, %r11b -; SSE2-NEXT: movsbq %r11b, %r8 -; SSE2-NEXT: movq %r8, %r11 -; SSE2-NEXT: sarq $63, %r11 +; SSE2-NEXT: movsbq %r11b, %rsi +; SSE2-NEXT: movq %rsi, %r8 +; SSE2-NEXT: sarq $63, %r8 ; SSE2-NEXT: addb %r10b, %r10b ; SSE2-NEXT: sarb %r10b ; SSE2-NEXT: addb %r9b, %r9b @@ -1842,68 +1894,59 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE2-NEXT: movsbq %r10b, %r9 ; SSE2-NEXT: movq %r9, %r10 ; SSE2-NEXT: sarq $63, %r10 -; SSE2-NEXT: addb %dil, %dil -; SSE2-NEXT: sarb %dil -; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: addb %bpl, %bpl -; SSE2-NEXT: sarb %bpl -; SSE2-NEXT: cmpb %dil, %bpl -; SSE2-NEXT: setl %dil -; SSE2-NEXT: setg %bpl -; SSE2-NEXT: subb %dil, %bpl -; SSE2-NEXT: movsbq %bpl, %rdi -; SSE2-NEXT: movq %rdi, %r13 -; SSE2-NEXT: sarq $63, %r13 +; SSE2-NEXT: movq %r10, %rbp +; SSE2-NEXT: shldq $20, %r9, %rbp +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero +; SSE2-NEXT: movq %r8, %r11 +; SSE2-NEXT: shldq $31, %rsi, %r11 +; SSE2-NEXT: movq %rbp, 64(%rax) +; SSE2-NEXT: movq %rcx, %rbp +; SSE2-NEXT: shldq $42, %r12, %rbp +; SSE2-NEXT: movq %r11, 48(%rax) +; SSE2-NEXT: movq %rbp, 32(%rax) +; SSE2-NEXT: movabsq $9007199254738944, %r11 # imm = 0x1FFFFFFFFFF800 +; SSE2-NEXT: andq %rdi, %r11 +; SSE2-NEXT: shldq $53, %rdx, %rdi +; SSE2-NEXT: movq %rdi, 16(%rax) ; SSE2-NEXT: movl %r13d, 96(%rax) -; SSE2-NEXT: movabsq $2251799813685247, %rbp # imm = 0x7FFFFFFFFFFFF -; SSE2-NEXT: andq %r13, %rbp -; SSE2-NEXT: shldq $62, %rdi, %r13 +; SSE2-NEXT: movabsq $2251799813685247, %rdi # imm = 0x7FFFFFFFFFFFF +; SSE2-NEXT: andq %r13, %rdi +; SSE2-NEXT: shldq $62, %r15, %r13 ; SSE2-NEXT: movq %r13, 88(%rax) -; SSE2-NEXT: movq %r10, %r13 -; SSE2-NEXT: shldq $20, %r9, %r13 -; SSE2-NEXT: movq %r13, 64(%rax) -; SSE2-NEXT: movq %r11, %r13 -; SSE2-NEXT: shldq $31, %r8, %r13 -; SSE2-NEXT: movq %r13, 48(%rax) -; SSE2-NEXT: movq %rcx, %r13 -; SSE2-NEXT: shldq $42, %rbx, %r13 -; SSE2-NEXT: movq %r13, 32(%rax) -; SSE2-NEXT: movabsq $9007199254738944, %r13 # imm = 0x1FFFFFFFFFF800 -; SSE2-NEXT: andq %r12, %r13 -; SSE2-NEXT: shldq $53, %rdx, %r12 -; SSE2-NEXT: movq %r12, 16(%rax) -; SSE2-NEXT: movq %rbp, %r12 -; SSE2-NEXT: shrq $48, %r12 -; SSE2-NEXT: movb %r12b, 102(%rax) -; SSE2-NEXT: shrq $32, %rbp -; SSE2-NEXT: movw %bp, 100(%rax) -; SSE2-NEXT: movabsq $9007199254740991, %r12 # imm = 0x1FFFFFFFFFFFFF -; SSE2-NEXT: andq %r12, %r15 -; SSE2-NEXT: shldq $9, %r14, %r15 -; SSE2-NEXT: shlq $62, %rdi -; SSE2-NEXT: orq %r15, %rdi -; SSE2-NEXT: movq %rdi, 80(%rax) -; SSE2-NEXT: shlq $42, %rbx -; SSE2-NEXT: shrq $11, %r13 -; SSE2-NEXT: orq %rbx, %r13 -; SSE2-NEXT: movq %r13, 24(%rax) -; SSE2-NEXT: shlq $9, %r14 +; SSE2-NEXT: shlq $42, %r12 +; SSE2-NEXT: shrq $11, %r11 +; SSE2-NEXT: orq %r12, %r11 +; SSE2-NEXT: movq %r11, 24(%rax) +; SSE2-NEXT: movq %rdi, %r11 +; SSE2-NEXT: shrq $48, %r11 +; SSE2-NEXT: movb %r11b, 102(%rax) +; SSE2-NEXT: shrq $32, %rdi +; SSE2-NEXT: movw %di, 100(%rax) +; SSE2-NEXT: movabsq $9007199254740991, %rdi # imm = 0x1FFFFFFFFFFFFF +; SSE2-NEXT: andq %rdi, %r14 +; SSE2-NEXT: shldq $9, %rbx, %r14 +; SSE2-NEXT: shlq $62, 
%r15 +; SSE2-NEXT: orq %r14, %r15 +; SSE2-NEXT: movq %r15, 80(%rax) +; SSE2-NEXT: shlq $9, %rbx ; SSE2-NEXT: andl $511, %r10d # imm = 0x1FF -; SSE2-NEXT: orq %r14, %r10 +; SSE2-NEXT: orq %rbx, %r10 ; SSE2-NEXT: movq %r10, 72(%rax) ; SSE2-NEXT: shlq $20, %r9 -; SSE2-NEXT: andl $1048575, %r11d # imm = 0xFFFFF -; SSE2-NEXT: orq %r9, %r11 -; SSE2-NEXT: movq %r11, 56(%rax) -; SSE2-NEXT: shlq $31, %r8 +; SSE2-NEXT: andl $1048575, %r8d # imm = 0xFFFFF +; SSE2-NEXT: orq %r9, %r8 +; SSE2-NEXT: movq %r8, 56(%rax) +; SSE2-NEXT: shlq $31, %rsi ; SSE2-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF -; SSE2-NEXT: orq %r8, %rcx +; SSE2-NEXT: orq %rsi, %rcx ; SSE2-NEXT: movq %rcx, 40(%rax) -; SSE2-NEXT: movq %rsi, %xmm1 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Folded Reload +; SSE2-NEXT: # xmm1 = mem[0],zero ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: andq %r12, %rcx +; SSE2-NEXT: andq %rdi, %rcx ; SSE2-NEXT: shlq $53, %rdx ; SSE2-NEXT: orq %rcx, %rdx ; SSE2-NEXT: movq %rdx, 8(%rax) @@ -1924,140 +1967,143 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE4-NEXT: pushq %r12 ; SSE4-NEXT: pushq %rbx ; SSE4-NEXT: movq %rdi, %rax -; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp +; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d ; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d ; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; SSE4-NEXT: addb %dil, %dil +; SSE4-NEXT: sarb %dil +; SSE4-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; SSE4-NEXT: addb %r10b, %r10b +; SSE4-NEXT: sarb %r10b +; SSE4-NEXT: cmpb %dil, %r10b +; SSE4-NEXT: setl %dil +; SSE4-NEXT: setg %r10b +; SSE4-NEXT: subb %dil, %r10b +; SSE4-NEXT: movsbq %r10b, %r13 +; SSE4-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE4-NEXT: sarq $63, %r13 +; SSE4-NEXT: addb %r11b, %r11b +; SSE4-NEXT: sarb %r11b +; SSE4-NEXT: addb %sil, %sil +; SSE4-NEXT: sarb %sil +; SSE4-NEXT: cmpb %r11b, %sil +; SSE4-NEXT: setl %sil +; SSE4-NEXT: setg %r11b +; SSE4-NEXT: subb %sil, %r11b +; SSE4-NEXT: movsbq %r11b, %r11 +; SSE4-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE4-NEXT: sarq $63, %r11 ; SSE4-NEXT: addb %r14b, %r14b ; SSE4-NEXT: sarb %r14b +; SSE4-NEXT: movl {{[0-9]+}}(%rsp), %esi ; SSE4-NEXT: addb %sil, %sil ; SSE4-NEXT: sarb %sil ; SSE4-NEXT: cmpb %r14b, %sil ; SSE4-NEXT: setl %sil ; SSE4-NEXT: setg %r14b ; SSE4-NEXT: subb %sil, %r14b -; SSE4-NEXT: movsbq %r14b, %r14 -; SSE4-NEXT: movq %r14, (%rax) +; SSE4-NEXT: movsbq %r14b, %rsi +; SSE4-NEXT: movq %rsi, %r14 ; SSE4-NEXT: sarq $63, %r14 ; SSE4-NEXT: addb %r15b, %r15b ; SSE4-NEXT: sarb %r15b -; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %esi -; SSE4-NEXT: addb %sil, %sil -; SSE4-NEXT: sarb %sil -; SSE4-NEXT: cmpb %r15b, %sil -; SSE4-NEXT: setl %sil -; SSE4-NEXT: setg %r15b -; SSE4-NEXT: subb %sil, %r15b -; SSE4-NEXT: movsbq %r15b, %rsi -; SSE4-NEXT: movq %rsi, %r15 -; SSE4-NEXT: sarq $63, %r15 -; SSE4-NEXT: addb %bpl, %bpl -; SSE4-NEXT: sarb %bpl ; SSE4-NEXT: addb %dl, %dl ; SSE4-NEXT: sarb %dl -; SSE4-NEXT: cmpb %bpl, %dl +; SSE4-NEXT: cmpb %r15b, %dl ; SSE4-NEXT: setl %dl -; SSE4-NEXT: setg %bpl -; SSE4-NEXT: subb %dl, %bpl -; SSE4-NEXT: movsbq %bpl, %r12 -; SSE4-NEXT: movq %r12, %r13 -; SSE4-NEXT: 
sarq $63, %r13 -; SSE4-NEXT: addb %bl, %bl -; SSE4-NEXT: sarb %bl +; SSE4-NEXT: setg %r15b +; SSE4-NEXT: subb %dl, %r15b +; SSE4-NEXT: movsbq %r15b, %r15 +; SSE4-NEXT: movq %r15, %rdi +; SSE4-NEXT: sarq $63, %rdi +; SSE4-NEXT: addb %r12b, %r12b +; SSE4-NEXT: sarb %r12b ; SSE4-NEXT: addb %cl, %cl ; SSE4-NEXT: sarb %cl -; SSE4-NEXT: cmpb %bl, %cl +; SSE4-NEXT: cmpb %r12b, %cl ; SSE4-NEXT: setl %cl -; SSE4-NEXT: setg %dl -; SSE4-NEXT: subb %cl, %dl -; SSE4-NEXT: movsbq %dl, %rbx -; SSE4-NEXT: movq %rbx, %rcx +; SSE4-NEXT: setg %r12b +; SSE4-NEXT: subb %cl, %r12b +; SSE4-NEXT: movsbq %r12b, %r12 +; SSE4-NEXT: movq %r12, %rcx ; SSE4-NEXT: sarq $63, %rcx -; SSE4-NEXT: addb %r11b, %r11b -; SSE4-NEXT: sarb %r11b +; SSE4-NEXT: addb %bpl, %bpl +; SSE4-NEXT: sarb %bpl ; SSE4-NEXT: addb %r8b, %r8b ; SSE4-NEXT: sarb %r8b -; SSE4-NEXT: cmpb %r11b, %r8b -; SSE4-NEXT: setl %dl -; SSE4-NEXT: setg %r8b -; SSE4-NEXT: subb %dl, %r8b -; SSE4-NEXT: movsbq %r8b, %rdx -; SSE4-NEXT: movq %rdx, %r8 -; SSE4-NEXT: sarq $63, %r8 -; SSE4-NEXT: addb %r10b, %r10b -; SSE4-NEXT: sarb %r10b +; SSE4-NEXT: cmpb %bpl, %r8b +; SSE4-NEXT: setl %r8b +; SSE4-NEXT: setg %bpl +; SSE4-NEXT: subb %r8b, %bpl +; SSE4-NEXT: movsbq %bpl, %r10 +; SSE4-NEXT: movq %r10, %rbp +; SSE4-NEXT: sarq $63, %rbp +; SSE4-NEXT: addb %bl, %bl +; SSE4-NEXT: sarb %bl ; SSE4-NEXT: addb %r9b, %r9b ; SSE4-NEXT: sarb %r9b -; SSE4-NEXT: cmpb %r10b, %r9b +; SSE4-NEXT: cmpb %bl, %r9b ; SSE4-NEXT: setl %r9b -; SSE4-NEXT: setg %r10b -; SSE4-NEXT: subb %r9b, %r10b -; SSE4-NEXT: movsbq %r10b, %r9 -; SSE4-NEXT: movq %r9, %r10 -; SSE4-NEXT: sarq $63, %r10 -; SSE4-NEXT: addb %dil, %dil -; SSE4-NEXT: sarb %dil -; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; SSE4-NEXT: addb %r11b, %r11b -; SSE4-NEXT: sarb %r11b -; SSE4-NEXT: cmpb %dil, %r11b -; SSE4-NEXT: setl %dil -; SSE4-NEXT: setg %r11b -; SSE4-NEXT: subb %dil, %r11b -; SSE4-NEXT: movsbq %r11b, %rdi -; SSE4-NEXT: movq %rdi, %rbp -; SSE4-NEXT: sarq $63, %rbp -; SSE4-NEXT: movl %ebp, 96(%rax) -; SSE4-NEXT: movabsq $2251799813685247, %r11 # imm = 0x7FFFFFFFFFFFF -; SSE4-NEXT: andq %rbp, %r11 -; SSE4-NEXT: shldq $62, %rdi, %rbp -; SSE4-NEXT: movq %rbp, 88(%rax) -; SSE4-NEXT: movq %r10, %rbp -; SSE4-NEXT: shldq $20, %r9, %rbp -; SSE4-NEXT: movq %rbp, 64(%rax) -; SSE4-NEXT: movq %r8, %rbp -; SSE4-NEXT: shldq $31, %rdx, %rbp -; SSE4-NEXT: movq %rbp, 48(%rax) -; SSE4-NEXT: movq %rcx, %rbp -; SSE4-NEXT: shldq $42, %rbx, %rbp -; SSE4-NEXT: movq %rbp, 32(%rax) -; SSE4-NEXT: movabsq $9007199254738944, %rbp # imm = 0x1FFFFFFFFFF800 -; SSE4-NEXT: andq %r13, %rbp -; SSE4-NEXT: shldq $53, %r12, %r13 -; SSE4-NEXT: movq %r13, 16(%rax) -; SSE4-NEXT: movq %r11, %r13 -; SSE4-NEXT: shrq $48, %r13 -; SSE4-NEXT: movb %r13b, 102(%rax) -; SSE4-NEXT: shrq $32, %r11 -; SSE4-NEXT: movw %r11w, 100(%rax) -; SSE4-NEXT: movabsq $9007199254740991, %r11 # imm = 0x1FFFFFFFFFFFFF -; SSE4-NEXT: andq %r11, %r15 -; SSE4-NEXT: shldq $9, %rsi, %r15 -; SSE4-NEXT: shlq $62, %rdi -; SSE4-NEXT: orq %r15, %rdi -; SSE4-NEXT: movq %rdi, 80(%rax) -; SSE4-NEXT: andq %r11, %r14 -; SSE4-NEXT: shlq $53, %r12 -; SSE4-NEXT: orq %r14, %r12 -; SSE4-NEXT: movq %r12, 8(%rax) -; SSE4-NEXT: shlq $42, %rbx -; SSE4-NEXT: shrq $11, %rbp -; SSE4-NEXT: orq %rbx, %rbp -; SSE4-NEXT: movq %rbp, 24(%rax) -; SSE4-NEXT: shlq $9, %rsi -; SSE4-NEXT: andl $511, %r10d # imm = 0x1FF -; SSE4-NEXT: orq %rsi, %r10 -; SSE4-NEXT: movq %r10, 72(%rax) -; SSE4-NEXT: shlq $20, %r9 -; SSE4-NEXT: andl $1048575, %r8d # imm = 0xFFFFF -; SSE4-NEXT: orq %r9, %r8 -; SSE4-NEXT: movq %r8, 56(%rax) -; SSE4-NEXT: 
shlq $31, %rdx +; SSE4-NEXT: setg %bl +; SSE4-NEXT: subb %r9b, %bl +; SSE4-NEXT: movsbq %bl, %rdx +; SSE4-NEXT: movq %rdx, %r9 +; SSE4-NEXT: sarq $63, %r9 +; SSE4-NEXT: movq %r9, %rbx +; SSE4-NEXT: shldq $20, %rdx, %rbx +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE4-NEXT: movq %r8, (%rax) +; SSE4-NEXT: movq %rbp, %r8 +; SSE4-NEXT: shldq $31, %r10, %r8 +; SSE4-NEXT: movq %rbx, 64(%rax) +; SSE4-NEXT: movq %rcx, %rbx +; SSE4-NEXT: shldq $42, %r12, %rbx +; SSE4-NEXT: movq %r8, 48(%rax) +; SSE4-NEXT: movq %rbx, 32(%rax) +; SSE4-NEXT: movabsq $9007199254738944, %r8 # imm = 0x1FFFFFFFFFF800 +; SSE4-NEXT: andq %rdi, %r8 +; SSE4-NEXT: shldq $53, %r15, %rdi +; SSE4-NEXT: movq %rdi, 16(%rax) +; SSE4-NEXT: movl %r14d, 96(%rax) +; SSE4-NEXT: movabsq $2251799813685247, %rdi # imm = 0x7FFFFFFFFFFFF +; SSE4-NEXT: andq %r14, %rdi +; SSE4-NEXT: shldq $62, %rsi, %r14 +; SSE4-NEXT: movq %r14, 88(%rax) +; SSE4-NEXT: movabsq $9007199254740991, %rbx # imm = 0x1FFFFFFFFFFFFF +; SSE4-NEXT: andq %rbx, %r11 +; SSE4-NEXT: shlq $53, %r15 +; SSE4-NEXT: orq %r11, %r15 +; SSE4-NEXT: movq %r15, 8(%rax) +; SSE4-NEXT: shlq $42, %r12 +; SSE4-NEXT: shrq $11, %r8 +; SSE4-NEXT: orq %r12, %r8 +; SSE4-NEXT: movq %r8, 24(%rax) +; SSE4-NEXT: movq %rdi, %r8 +; SSE4-NEXT: shrq $48, %r8 +; SSE4-NEXT: movb %r8b, 102(%rax) +; SSE4-NEXT: shrq $32, %rdi +; SSE4-NEXT: movw %di, 100(%rax) +; SSE4-NEXT: andq %rbx, %r13 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; SSE4-NEXT: shldq $9, %rdi, %r13 +; SSE4-NEXT: shlq $62, %rsi +; SSE4-NEXT: orq %r13, %rsi +; SSE4-NEXT: movq %rsi, 80(%rax) +; SSE4-NEXT: shlq $9, %rdi +; SSE4-NEXT: andl $511, %r9d # imm = 0x1FF +; SSE4-NEXT: orq %rdi, %r9 +; SSE4-NEXT: movq %r9, 72(%rax) +; SSE4-NEXT: shlq $20, %rdx +; SSE4-NEXT: andl $1048575, %ebp # imm = 0xFFFFF +; SSE4-NEXT: orq %rdx, %rbp +; SSE4-NEXT: movq %rbp, 56(%rax) +; SSE4-NEXT: shlq $31, %r10 ; SSE4-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF -; SSE4-NEXT: orq %rdx, %rcx +; SSE4-NEXT: orq %r10, %rcx ; SSE4-NEXT: movq %rcx, 40(%rax) ; SSE4-NEXT: popq %rbx ; SSE4-NEXT: popq %r12 @@ -2076,132 +2122,132 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; AVX-NEXT: pushq %r12 ; AVX-NEXT: pushq %rbx ; AVX-NEXT: movq %rdi, %rax -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d ; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d -; AVX-NEXT: addb %r14b, %r14b -; AVX-NEXT: sarb %r14b +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; AVX-NEXT: addb %dil, %dil +; AVX-NEXT: sarb %dil +; AVX-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; AVX-NEXT: addb %r10b, %r10b +; AVX-NEXT: sarb %r10b +; AVX-NEXT: cmpb %dil, %r10b +; AVX-NEXT: setl %dil +; AVX-NEXT: setg %r10b +; AVX-NEXT: subb %dil, %r10b +; AVX-NEXT: movsbq %r10b, %rdi +; AVX-NEXT: movq %rdi, %r10 +; AVX-NEXT: sarq $63, %r10 +; AVX-NEXT: addb %bpl, %bpl +; AVX-NEXT: sarb %bpl ; AVX-NEXT: addb %sil, %sil ; AVX-NEXT: sarb %sil -; AVX-NEXT: cmpb %r14b, %sil +; AVX-NEXT: cmpb %bpl, %sil ; AVX-NEXT: setl %sil -; AVX-NEXT: setg %r14b -; AVX-NEXT: subb %sil, %r14b -; AVX-NEXT: movsbq %r14b, %r14 -; AVX-NEXT: movq %r14, (%rax) -; AVX-NEXT: sarq $63, %r14 +; AVX-NEXT: setg %bpl +; AVX-NEXT: subb %sil, %bpl +; AVX-NEXT: 
movsbq %bpl, %r12 +; AVX-NEXT: movq %r12, (%rax) +; AVX-NEXT: sarq $63, %r12 ; AVX-NEXT: addb %r15b, %r15b ; AVX-NEXT: sarb %r15b -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %esi +; AVX-NEXT: movl {{[0-9]+}}(%rsp), %esi ; AVX-NEXT: addb %sil, %sil ; AVX-NEXT: sarb %sil ; AVX-NEXT: cmpb %r15b, %sil ; AVX-NEXT: setl %sil -; AVX-NEXT: setg %r15b -; AVX-NEXT: subb %sil, %r15b -; AVX-NEXT: movsbq %r15b, %rsi -; AVX-NEXT: movq %rsi, %r12 -; AVX-NEXT: sarq $63, %r12 -; AVX-NEXT: addb %bpl, %bpl -; AVX-NEXT: sarb %bpl +; AVX-NEXT: setg %bpl +; AVX-NEXT: subb %sil, %bpl +; AVX-NEXT: movsbq %bpl, %rsi +; AVX-NEXT: movq %rsi, %r15 +; AVX-NEXT: sarq $63, %r15 +; AVX-NEXT: addb %r14b, %r14b +; AVX-NEXT: sarb %r14b ; AVX-NEXT: addb %dl, %dl ; AVX-NEXT: sarb %dl -; AVX-NEXT: cmpb %bpl, %dl +; AVX-NEXT: cmpb %r14b, %dl ; AVX-NEXT: setl %dl ; AVX-NEXT: setg %bpl ; AVX-NEXT: subb %dl, %bpl -; AVX-NEXT: movsbq %bpl, %r15 -; AVX-NEXT: movq %r15, %r13 -; AVX-NEXT: sarq $63, %r13 -; AVX-NEXT: addb %bl, %bl -; AVX-NEXT: sarb %bl +; AVX-NEXT: movsbq %bpl, %r14 +; AVX-NEXT: movq %r14, %rbp +; AVX-NEXT: sarq $63, %rbp +; AVX-NEXT: addb %r13b, %r13b +; AVX-NEXT: sarb %r13b ; AVX-NEXT: addb %cl, %cl ; AVX-NEXT: sarb %cl -; AVX-NEXT: cmpb %bl, %cl +; AVX-NEXT: cmpb %r13b, %cl ; AVX-NEXT: setl %cl ; AVX-NEXT: setg %dl ; AVX-NEXT: subb %cl, %dl -; AVX-NEXT: movsbq %dl, %rbx -; AVX-NEXT: movq %rbx, %rcx +; AVX-NEXT: movsbq %dl, %r13 +; AVX-NEXT: movq %r13, %rcx ; AVX-NEXT: sarq $63, %rcx -; AVX-NEXT: addb %r11b, %r11b -; AVX-NEXT: sarb %r11b +; AVX-NEXT: addb %bl, %bl +; AVX-NEXT: sarb %bl ; AVX-NEXT: addb %r8b, %r8b ; AVX-NEXT: sarb %r8b -; AVX-NEXT: cmpb %r11b, %r8b +; AVX-NEXT: cmpb %bl, %r8b ; AVX-NEXT: setl %dl ; AVX-NEXT: setg %r8b ; AVX-NEXT: subb %dl, %r8b ; AVX-NEXT: movsbq %r8b, %rdx ; AVX-NEXT: movq %rdx, %r8 ; AVX-NEXT: sarq $63, %r8 -; AVX-NEXT: addb %r10b, %r10b -; AVX-NEXT: sarb %r10b +; AVX-NEXT: addb %r11b, %r11b +; AVX-NEXT: sarb %r11b ; AVX-NEXT: addb %r9b, %r9b ; AVX-NEXT: sarb %r9b -; AVX-NEXT: cmpb %r10b, %r9b +; AVX-NEXT: cmpb %r11b, %r9b ; AVX-NEXT: setl %r9b -; AVX-NEXT: setg %r10b -; AVX-NEXT: subb %r9b, %r10b -; AVX-NEXT: movsbq %r10b, %r9 -; AVX-NEXT: movq %r9, %r10 -; AVX-NEXT: sarq $63, %r10 -; AVX-NEXT: addb %dil, %dil -; AVX-NEXT: sarb %dil -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; AVX-NEXT: addb %r11b, %r11b -; AVX-NEXT: sarb %r11b -; AVX-NEXT: cmpb %dil, %r11b -; AVX-NEXT: setl %dil ; AVX-NEXT: setg %r11b -; AVX-NEXT: subb %dil, %r11b -; AVX-NEXT: movsbq %r11b, %rdi -; AVX-NEXT: movq %rdi, %rbp -; AVX-NEXT: sarq $63, %rbp -; AVX-NEXT: movl %ebp, 96(%rax) -; AVX-NEXT: movb $51, %r11b -; AVX-NEXT: bzhiq %r11, %rbp, %r11 -; AVX-NEXT: shldq $62, %rdi, %rbp -; AVX-NEXT: movq %rbp, 88(%rax) -; AVX-NEXT: movq %r10, %rbp -; AVX-NEXT: shldq $20, %r9, %rbp -; AVX-NEXT: movq %rbp, 64(%rax) -; AVX-NEXT: movq %r8, %rbp -; AVX-NEXT: shldq $31, %rdx, %rbp -; AVX-NEXT: movq %rbp, 48(%rax) -; AVX-NEXT: movq %rcx, %rbp -; AVX-NEXT: shldq $42, %rbx, %rbp -; AVX-NEXT: movq %rbp, 32(%rax) -; AVX-NEXT: movb $42, %bpl -; AVX-NEXT: bzhiq %rbp, %r13, %rbp -; AVX-NEXT: shldq $53, %r15, %r13 -; AVX-NEXT: movq %r13, 16(%rax) -; AVX-NEXT: movq %r11, %r13 -; AVX-NEXT: shrq $48, %r13 -; AVX-NEXT: movb %r13b, 102(%rax) -; AVX-NEXT: shrq $32, %r11 -; AVX-NEXT: movw %r11w, 100(%rax) -; AVX-NEXT: movb $53, %r11b -; AVX-NEXT: bzhiq %r11, %r12, %r12 -; AVX-NEXT: shldq $9, %rsi, %r12 -; AVX-NEXT: shlq $62, %rdi -; AVX-NEXT: orq %r12, %rdi -; AVX-NEXT: movq %rdi, 80(%rax) -; AVX-NEXT: shlq $42, %rbx -; AVX-NEXT: orq %rbp, %rbx 
-; AVX-NEXT: movq %rbx, 24(%rax) -; AVX-NEXT: bzhiq %r11, %r14, %rdi -; AVX-NEXT: shlq $53, %r15 -; AVX-NEXT: orq %rdi, %r15 -; AVX-NEXT: movq %r15, 8(%rax) -; AVX-NEXT: shlq $9, %rsi -; AVX-NEXT: andl $511, %r10d # imm = 0x1FF -; AVX-NEXT: orq %rsi, %r10 -; AVX-NEXT: movq %r10, 72(%rax) +; AVX-NEXT: subb %r9b, %r11b +; AVX-NEXT: movsbq %r11b, %r9 +; AVX-NEXT: movq %r9, %r11 +; AVX-NEXT: sarq $63, %r11 +; AVX-NEXT: movq %r11, %rbx +; AVX-NEXT: shldq $20, %r9, %rbx +; AVX-NEXT: movq %rbx, 64(%rax) +; AVX-NEXT: movq %r8, %rbx +; AVX-NEXT: shldq $31, %rdx, %rbx +; AVX-NEXT: movq %rbx, 48(%rax) +; AVX-NEXT: movq %rcx, %rbx +; AVX-NEXT: shldq $42, %r13, %rbx +; AVX-NEXT: movq %rbx, 32(%rax) +; AVX-NEXT: movb $42, %bl +; AVX-NEXT: bzhiq %rbx, %rbp, %rbx +; AVX-NEXT: shldq $53, %r14, %rbp +; AVX-NEXT: movq %rbp, 16(%rax) +; AVX-NEXT: movl %r15d, 96(%rax) +; AVX-NEXT: movb $51, %bpl +; AVX-NEXT: bzhiq %rbp, %r15, %rbp +; AVX-NEXT: shldq $62, %rsi, %r15 +; AVX-NEXT: movq %r15, 88(%rax) +; AVX-NEXT: shlq $42, %r13 +; AVX-NEXT: orq %rbx, %r13 +; AVX-NEXT: movq %r13, 24(%rax) +; AVX-NEXT: movb $53, %bl +; AVX-NEXT: bzhiq %rbx, %r12, %r15 +; AVX-NEXT: shlq $53, %r14 +; AVX-NEXT: orq %r15, %r14 +; AVX-NEXT: movq %r14, 8(%rax) +; AVX-NEXT: movq %rbp, %r14 +; AVX-NEXT: shrq $48, %r14 +; AVX-NEXT: movb %r14b, 102(%rax) +; AVX-NEXT: shrq $32, %rbp +; AVX-NEXT: movw %bp, 100(%rax) +; AVX-NEXT: bzhiq %rbx, %r10, %r10 +; AVX-NEXT: shldq $9, %rdi, %r10 +; AVX-NEXT: shlq $62, %rsi +; AVX-NEXT: orq %r10, %rsi +; AVX-NEXT: movq %rsi, 80(%rax) +; AVX-NEXT: shlq $9, %rdi +; AVX-NEXT: andl $511, %r11d # imm = 0x1FF +; AVX-NEXT: orq %rdi, %r11 +; AVX-NEXT: movq %r11, 72(%rax) ; AVX-NEXT: shlq $20, %r9 ; AVX-NEXT: andl $1048575, %r8d # imm = 0xFFFFF ; AVX-NEXT: orq %r9, %r8 @@ -2671,12 +2717,12 @@ define <2 x i16> @scmp_ret_wider_than_operands(<2 x i8> %x, <2 x i8> %y) nounwin ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al ; X86-NEXT: setg %dl ; X86-NEXT: subb %al, %dl ; X86-NEXT: movsbl %dl, %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl +; X86-NEXT: cmpb %cl, {{[0-9]+}}(%esp) ; X86-NEXT: setl %cl ; X86-NEXT: setg %dl ; X86-NEXT: subb %cl, %dl diff --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll index 7f17299b39e33..af2275eda305f 100644 --- a/llvm/test/CodeGen/X86/ucmp.ll +++ b/llvm/test/CodeGen/X86/ucmp.ll @@ -16,7 +16,7 @@ define i8 @ucmp.8.8(i8 %x, i8 %y) nounwind { ; X86-LABEL: ucmp.8.8: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: retl @@ -35,7 +35,7 @@ define i8 @ucmp.8.16(i16 %x, i16 %y) nounwind { ; X86-LABEL: ucmp.8.16: ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw {{[0-9]+}}(%esp), %ax +; X86-NEXT: cmpw %ax, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: retl @@ -54,7 +54,7 @@ define i8 @ucmp.8.32(i32 %x, i32 %y) nounwind { ; X86-LABEL: ucmp.8.32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: retl @@ -155,7 +155,7 @@ define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind { ; X86-LABEL: ucmp.32.32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax 
+; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movsbl %al, %eax @@ -245,7 +245,7 @@ define i4 @ucmp_narrow_result(i32 %x, i32 %y) nounwind { ; X86-LABEL: ucmp_narrow_result: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: retl @@ -278,18 +278,18 @@ define i8 @ucmp_narrow_op(i62 %x, i62 %y) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl $1073741823, %ecx # imm = 0x3FFFFFFF +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $1073741823, %edx # imm = 0x3FFFFFFF ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl %esi, %edi -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $1073741823, %edi # imm = 0x3FFFFFFF +; X86-NEXT: cmpl %ecx, %esi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: sbbl %edx, %eax ; X86-NEXT: setb %al -; X86-NEXT: cmpl %edi, %esi -; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: cmpl %esi, %ecx +; X86-NEXT: sbbl %edi, %edx ; X86-NEXT: sbbb $0, %al ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -313,9 +313,9 @@ define i141 @ucmp_wide_result(i32 %x, i32 %y) nounwind { ; ; X86-LABEL: ucmp_wide_result: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) ; X86-NEXT: seta %cl ; X86-NEXT: sbbb $0, %cl ; X86-NEXT: movsbl %cl, %ecx @@ -366,26 +366,26 @@ define i8 @ucmp_wide_op(i109 %x, i109 %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl $8191, %ecx # imm = 0x1FFF -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $8191, %ecx # imm = 0x1FFF +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: andl $8191, %esi # imm = 0x1FFF ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: sbbl %esi, %eax -; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl %ecx, %eax ; X86-NEXT: setb %al ; X86-NEXT: cmpl %ebp, {{[0-9]+}}(%esp) ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: sbbl %ebx, %edx +; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: sbbb $0, %al ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -470,24 +470,24 @@ define <4 x i32> @ucmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edx, {{[0-9]+}}(%esp) ; X86-NEXT: seta %dl ; X86-NEXT: sbbb $0, %dl ; X86-NEXT: movsbl %dl, %edx -; X86-NEXT: cmpl 
{{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: seta %bl ; X86-NEXT: sbbb $0, %bl ; X86-NEXT: movsbl %bl, %edi -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %esi, {{[0-9]+}}(%esp) ; X86-NEXT: seta %bl ; X86-NEXT: sbbb $0, %bl ; X86-NEXT: movsbl %bl, %esi -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) ; X86-NEXT: seta %cl ; X86-NEXT: sbbb $0, %cl ; X86-NEXT: movsbl %cl, %ecx @@ -611,27 +611,27 @@ define <4 x i8> @ucmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: seta %cl -; X86-NEXT: sbbb $0, %cl -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi -; X86-NEXT: seta %ch -; X86-NEXT: sbbb $0, %ch -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi -; X86-NEXT: seta %bl -; X86-NEXT: sbbb $0, %bl -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edx, {{[0-9]+}}(%esp) ; X86-NEXT: seta %dl ; X86-NEXT: sbbb $0, %dl -; X86-NEXT: movb %dl, 3(%eax) +; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: seta %dh +; X86-NEXT: sbbb $0, %dh +; X86-NEXT: cmpl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: seta %bl +; X86-NEXT: sbbb $0, %bl +; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: seta %cl +; X86-NEXT: sbbb $0, %cl +; X86-NEXT: movb %cl, 3(%eax) ; X86-NEXT: movb %bl, 2(%eax) -; X86-NEXT: movb %ch, 1(%eax) -; X86-NEXT: movb %cl, (%eax) +; X86-NEXT: movb %dh, 1(%eax) +; X86-NEXT: movb %dl, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -691,24 +691,24 @@ define <4 x i32> @ucmp_narrow_vec_op(<4 x i8> %x, <4 x i8> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %dl, {{[0-9]+}}(%esp) ; X86-NEXT: seta %dl ; X86-NEXT: sbbb $0, %dl ; X86-NEXT: movsbl %dl, %edx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bl +; X86-NEXT: cmpb %bl, {{[0-9]+}}(%esp) ; X86-NEXT: seta %bl ; X86-NEXT: sbbb $0, %bl ; X86-NEXT: movsbl %bl, %esi -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ch +; X86-NEXT: cmpb %ch, {{[0-9]+}}(%esp) ; X86-NEXT: seta %ch ; X86-NEXT: sbbb $0, %ch ; X86-NEXT: movsbl %ch, %edi -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl +; X86-NEXT: cmpb %cl, {{[0-9]+}}(%esp) ; X86-NEXT: seta %cl ; X86-NEXT: sbbb $0, %cl ; X86-NEXT: movsbl %cl, %ecx @@ -767,38 +767,44 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind { ; ; SSE2-LABEL: ucmp_wide_vec_result: ; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE2-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-NEXT: psubd %xmm3, %xmm0 -; 
SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSE2-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; SSE2-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: psubd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE2-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] ; SSE2-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE2-NEXT: psubd %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pmaxud %xmm1, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm5 +; SSE2-NEXT: pminud %xmm2, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-NEXT: psubd %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSE2-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] -; SSE2-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; SSE2-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pmaxud %xmm2, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: pminud %xmm5, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 ; SSE2-NEXT: psubd %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; SSE2-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] -; SSE2-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: psubd %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; SSE2-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; SSE2-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pmaxud %xmm3, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE2-NEXT: pminud %xmm4, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE2-NEXT: psubd %xmm5, %xmm3 ; SSE2-NEXT: retq ; ; AVX2-LABEL: ucmp_wide_vec_result: @@ -812,8 +818,10 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind { ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero ; AVX2-NEXT: 
vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm3 -; AVX2-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm3 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm0, %ymm3 +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsubd %ymm3, %ymm0, %ymm1 ; AVX2-NEXT: vmovdqa %ymm2, %ymm0 ; AVX2-NEXT: retq @@ -842,74 +850,74 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind { ; X86-NEXT: movb {{[0-9]+}}(%esp), %bh ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl +; X86-NEXT: cmpb %cl, {{[0-9]+}}(%esp) ; X86-NEXT: seta %cl ; X86-NEXT: sbbb $0, %cl ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bh +; X86-NEXT: cmpb %bh, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bl +; X86-NEXT: cmpb %bl, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dh +; X86-NEXT: cmpb %dh, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ch +; X86-NEXT: cmpb %ch, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ah +; X86-NEXT: cmpb %ah, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dl +; X86-NEXT: cmpb %dl, {{[0-9]+}}(%esp) ; X86-NEXT: seta %bl ; X86-NEXT: sbbb $0, %bl ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, (%esp) # 1-byte Spill ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp) ; X86-NEXT: seta %bh ; X86-NEXT: sbbb $0, %bh ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movsbl %al, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movsbl %al, %edi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movsbl %al, %ebp ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movsbl %al, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: 
cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movsbl %al, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movsbl %al, %ecx @@ -1368,72 +1376,72 @@ define <16 x i8> @ucmp_wide_vec_op(<16 x i32> %x, <16 x i32> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmpl %ebp, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmpl %ebx, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmpl %ebx, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %edx, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %edx, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %esi, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) ; X86-NEXT: seta %bh ; X86-NEXT: sbbb $0, %bh ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: seta %bl ; X86-NEXT: sbbb $0, %bl -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: seta %dh ; X86-NEXT: sbbb $0, %dh ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: seta %ch ; X86-NEXT: sbbb $0, %ch ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: seta %dl ; X86-NEXT: sbbb $0, %dl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: 
cmpl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: seta %cl ; X86-NEXT: sbbb $0, %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1523,10 +1531,10 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE4-NEXT: andl $127, %eax -; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE4-NEXT: movq %rax, (%rsp) # 8-byte Spill ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE4-NEXT: andl $127, %eax -; SSE4-NEXT: movq %rax, (%rsp) # 8-byte Spill +; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE4-NEXT: andl $127, %eax ; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -1554,240 +1562,240 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE4-NEXT: andl $127, %eax ; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE4-NEXT: andl $127, %r10d +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; SSE4-NEXT: andl $127, %ebp ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE4-NEXT: andl $127, %eax ; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE4-NEXT: andl $127, %ecx ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r8 ; SSE4-NEXT: andl $127, %r8d -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; SSE4-NEXT: andl $127, %ebx -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE4-NEXT: andl $127, %edx ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r13 ; SSE4-NEXT: andl $127, %r13d -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; SSE4-NEXT: andl $127, %r11d +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE4-NEXT: andl $127, %r10d +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE4-NEXT: andl $127, %edx +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; SSE4-NEXT: andl $127, %r15d ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE4-NEXT: andl $127, %r14d -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; SSE4-NEXT: andl $127, %r12d -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; SSE4-NEXT: cmpq %rax, %rbp -; SSE4-NEXT: movq %r12, %r15 -; SSE4-NEXT: sbbq %r14, %r15 -; SSE4-NEXT: setb %r15b -; SSE4-NEXT: cmpq %rbp, %rax -; SSE4-NEXT: sbbq %r12, %r14 -; SSE4-NEXT: sbbb $0, %r15b -; SSE4-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; SSE4-NEXT: cmpq %rax, %r14 -; SSE4-NEXT: movq %r11, %r15 -; SSE4-NEXT: sbbq %r13, %r15 -; SSE4-NEXT: setb %bpl -; SSE4-NEXT: cmpq %r14, %rax -; SSE4-NEXT: sbbq %r11, %r13 -; SSE4-NEXT: sbbb $0, %bpl -; SSE4-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; SSE4-NEXT: cmpq %rax, %r11 -; SSE4-NEXT: movq %rdx, %r14 -; SSE4-NEXT: sbbq %rbx, %r14 -; SSE4-NEXT: setb %bpl -; SSE4-NEXT: cmpq %r11, %rax -; SSE4-NEXT: sbbq %rdx, %rbx -; SSE4-NEXT: sbbb $0, %bpl -; SSE4-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE4-NEXT: andl $127, %r11d +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE4-NEXT: andl $127, %ecx ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE4-NEXT: cmpq %rax, %rdx -; SSE4-NEXT: movq %r8, %r11 +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; SSE4-NEXT: cmpq %rax, %r12 +; SSE4-NEXT: movq %rcx, %rbx +; SSE4-NEXT: sbbq %r11, %rbx +; SSE4-NEXT: setb %bl +; 
SSE4-NEXT: cmpq %r12, %rax ; SSE4-NEXT: sbbq %rcx, %r11 +; SSE4-NEXT: sbbb $0, %bl +; SSE4-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE4-NEXT: cmpq %rax, %rcx +; SSE4-NEXT: movq %r14, %r11 +; SSE4-NEXT: sbbq %r15, %r11 ; SSE4-NEXT: setb %r11b -; SSE4-NEXT: cmpq %rdx, %rax -; SSE4-NEXT: sbbq %r8, %rcx +; SSE4-NEXT: cmpq %rcx, %rax +; SSE4-NEXT: sbbq %r14, %r15 ; SSE4-NEXT: sbbb $0, %r11b ; SSE4-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE4-NEXT: cmpq %rax, %rcx -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE4-NEXT: movq %r8, %rdx -; SSE4-NEXT: sbbq %r10, %rdx -; SSE4-NEXT: setb %dl +; SSE4-NEXT: movq %rdx, %r11 +; SSE4-NEXT: sbbq %r10, %r11 +; SSE4-NEXT: setb %r11b ; SSE4-NEXT: cmpq %rcx, %rax -; SSE4-NEXT: sbbq %r8, %r10 -; SSE4-NEXT: sbbb $0, %dl -; SSE4-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE4-NEXT: sbbq %rdx, %r10 +; SSE4-NEXT: sbbb $0, %r11b +; SSE4-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE4-NEXT: cmpq %rax, %rcx -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE4-NEXT: movq %r11, %rdx -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE4-NEXT: movq %r13, %rdx ; SSE4-NEXT: sbbq %r8, %rdx -; SSE4-NEXT: setb %r10b +; SSE4-NEXT: setb %dl ; SSE4-NEXT: cmpq %rcx, %rax -; SSE4-NEXT: sbbq %r11, %r8 -; SSE4-NEXT: sbbb $0, %r10b +; SSE4-NEXT: sbbq %r13, %r8 +; SSE4-NEXT: sbbb $0, %dl +; SSE4-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE4-NEXT: cmpq %rax, %rcx -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE4-NEXT: movq %r11, %rdx ; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE4-NEXT: sbbq %r8, %rdx +; SSE4-NEXT: movq %r8, %rdx +; SSE4-NEXT: sbbq %rbp, %rdx ; SSE4-NEXT: setb %dl ; SSE4-NEXT: cmpq %rcx, %rax -; SSE4-NEXT: sbbq %r11, %r8 +; SSE4-NEXT: sbbq %r8, %rbp ; SSE4-NEXT: sbbb $0, %dl ; SSE4-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE4-NEXT: cmpq %rax, %rcx -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE4-NEXT: movq %r11, %rdx +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; SSE4-NEXT: movq %r10, %rdx ; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload ; SSE4-NEXT: sbbq %r8, %rdx -; SSE4-NEXT: setb %bpl +; SSE4-NEXT: setb %dl ; SSE4-NEXT: cmpq %rcx, %rax -; SSE4-NEXT: sbbq %r11, %r8 -; SSE4-NEXT: sbbb $0, %bpl +; SSE4-NEXT: sbbq %r10, %r8 +; SSE4-NEXT: sbbb $0, %dl +; SSE4-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE4-NEXT: cmpq %rax, %rcx -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE4-NEXT: movq %r11, %rdx +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; SSE4-NEXT: movq %r10, %rdx ; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload ; SSE4-NEXT: sbbq %r8, %rdx ; SSE4-NEXT: setb %dl ; SSE4-NEXT: cmpq %rcx, %rax -; SSE4-NEXT: sbbq %r11, %r8 +; SSE4-NEXT: sbbq %r10, %r8 ; SSE4-NEXT: sbbb $0, %dl -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: movb 
%dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE4-NEXT: cmpq %rax, %rcx -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE4-NEXT: movq %r14, %r8 -; SSE4-NEXT: movq (%rsp), %rbx # 8-byte Reload -; SSE4-NEXT: sbbq %rbx, %r8 -; SSE4-NEXT: setb %r11b -; SSE4-NEXT: cmpq %rcx, %rax -; SSE4-NEXT: sbbq %r14, %rbx -; SSE4-NEXT: sbbb $0, %r11b -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE4-NEXT: cmpq %rcx, %rdx +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; SSE4-NEXT: movq %r10, %rax +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE4-NEXT: sbbq %r8, %rax +; SSE4-NEXT: setb %r12b +; SSE4-NEXT: cmpq %rdx, %rcx +; SSE4-NEXT: sbbq %r10, %r8 +; SSE4-NEXT: sbbb $0, %r12b ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE4-NEXT: cmpq %rax, %rcx -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE4-NEXT: movq %r14, %rbx +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE4-NEXT: cmpq %rcx, %r10 ; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE4-NEXT: sbbq %r8, %rbx +; SSE4-NEXT: movq %r8, %rdx +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE4-NEXT: sbbq %rax, %rdx +; SSE4-NEXT: setb %bpl +; SSE4-NEXT: cmpq %r10, %rcx +; SSE4-NEXT: sbbq %r8, %rax +; SSE4-NEXT: sbbb $0, %bpl +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE4-NEXT: cmpq %r10, %r11 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE4-NEXT: movq %rdx, %rcx +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE4-NEXT: sbbq %rax, %rcx +; SSE4-NEXT: setb %r8b +; SSE4-NEXT: cmpq %r11, %r10 +; SSE4-NEXT: sbbq %rdx, %rax +; SSE4-NEXT: sbbb $0, %r8b +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; SSE4-NEXT: cmpq %r11, %rbx +; SSE4-NEXT: movq (%rsp), %rcx # 8-byte Reload +; SSE4-NEXT: movq %rcx, %r10 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE4-NEXT: sbbq %rax, %r10 +; SSE4-NEXT: setb %r10b +; SSE4-NEXT: cmpq %rbx, %r11 +; SSE4-NEXT: sbbq %rcx, %rax +; SSE4-NEXT: sbbb $0, %r10b +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; SSE4-NEXT: cmpq %rbx, %r14 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE4-NEXT: movq %rcx, %r11 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE4-NEXT: sbbq %rax, %r11 +; SSE4-NEXT: setb %r11b +; SSE4-NEXT: cmpq %r14, %rbx +; SSE4-NEXT: sbbq %rcx, %rax +; SSE4-NEXT: sbbb $0, %r11b +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; SSE4-NEXT: cmpq %r14, %r15 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE4-NEXT: movq %rcx, %rbx +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE4-NEXT: sbbq %rax, %rbx ; SSE4-NEXT: setb %bl -; SSE4-NEXT: cmpq %rcx, %rax -; SSE4-NEXT: sbbq %r14, %r8 +; SSE4-NEXT: cmpq %r15, %r14 +; SSE4-NEXT: sbbq %rcx, %rax ; SSE4-NEXT: sbbb $0, %bl -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; SSE4-NEXT: cmpq %rax, %r14 -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE4-NEXT: movq %r15, %rcx -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE4-NEXT: sbbq %r8, %rcx -; SSE4-NEXT: setb %cl -; SSE4-NEXT: cmpq %r14, %rax -; SSE4-NEXT: sbbq %r15, %r8 -; SSE4-NEXT: sbbb $0, %cl -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax 
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; SSE4-NEXT: cmpq %rax, %r15 -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE4-NEXT: movq %r12, %r14 -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE4-NEXT: sbbq %r8, %r14 +; SSE4-NEXT: cmpq %r9, %r15 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE4-NEXT: movq %rcx, %r14 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE4-NEXT: sbbq %rax, %r14 ; SSE4-NEXT: setb %r14b -; SSE4-NEXT: cmpq %r15, %rax -; SSE4-NEXT: sbbq %r12, %r8 +; SSE4-NEXT: cmpq %r15, %r9 +; SSE4-NEXT: sbbq %rcx, %rax ; SSE4-NEXT: sbbb $0, %r14b -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE4-NEXT: cmpq %r9, %rax -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE4-NEXT: movq %r12, %r15 -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE4-NEXT: sbbq %r8, %r15 -; SSE4-NEXT: setb %r15b -; SSE4-NEXT: cmpq %rax, %r9 -; SSE4-NEXT: sbbq %r12, %r8 -; SSE4-NEXT: sbbb $0, %r15b -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE4-NEXT: cmpq %r12, %rax -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; SSE4-NEXT: movq %r13, %r9 -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE4-NEXT: sbbq %r8, %r9 +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE4-NEXT: cmpq %rcx, %r15 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE4-NEXT: movq %rdx, %r9 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE4-NEXT: sbbq %rax, %r9 ; SSE4-NEXT: setb %r9b -; SSE4-NEXT: cmpq %rax, %r12 -; SSE4-NEXT: sbbq %r13, %r8 -; SSE4-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; SSE4-NEXT: cmpq %r15, %rcx +; SSE4-NEXT: sbbq %rdx, %rax +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r13 ; SSE4-NEXT: sbbb $0, %r9b -; SSE4-NEXT: cmpq %rsi, %r12 -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE4-NEXT: movq %r8, %rdi +; SSE4-NEXT: cmpq %rsi, %r13 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE4-NEXT: movq %rcx, %r15 ; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE4-NEXT: sbbq %rax, %rdi -; SSE4-NEXT: setb %dil -; SSE4-NEXT: cmpq %r12, %rsi -; SSE4-NEXT: sbbq %r8, %rax -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; SSE4-NEXT: sbbq %rax, %r15 +; SSE4-NEXT: setb %r15b +; SSE4-NEXT: cmpq %r13, %rsi +; SSE4-NEXT: sbbq %rcx, %rax ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; SSE4-NEXT: sbbb $0, %dil -; SSE4-NEXT: cmpq %r12, %r13 -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE4-NEXT: movq %r8, %rsi -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE4-NEXT: sbbq %rax, %rsi +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: sbbb $0, %r15b +; SSE4-NEXT: cmpq %r13, %rax +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE4-NEXT: movq %rdx, %rsi +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE4-NEXT: sbbq %rcx, %rsi ; SSE4-NEXT: setb %sil -; SSE4-NEXT: cmpq %r13, %r12 -; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSE4-NEXT: movd %r12d, %xmm1 -; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSE4-NEXT: movd %r12d, %xmm2 -; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSE4-NEXT: movd %r12d, %xmm3 -; SSE4-NEXT: 
movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSE4-NEXT: movd %r12d, %xmm4 -; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSE4-NEXT: movd %r12d, %xmm5 -; SSE4-NEXT: movzbl %r10b, %r10d -; SSE4-NEXT: movd %r10d, %xmm6 -; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SSE4-NEXT: movd %r10d, %xmm7 -; SSE4-NEXT: movzbl %bpl, %r10d -; SSE4-NEXT: movd %r10d, %xmm0 -; SSE4-NEXT: movzbl %dl, %edx -; SSE4-NEXT: movd %edx, %xmm8 -; SSE4-NEXT: movzbl %r11b, %edx -; SSE4-NEXT: movd %edx, %xmm9 -; SSE4-NEXT: movzbl %bl, %edx -; SSE4-NEXT: movd %edx, %xmm10 -; SSE4-NEXT: movzbl %cl, %ecx -; SSE4-NEXT: movd %ecx, %xmm11 -; SSE4-NEXT: movzbl %r14b, %ecx -; SSE4-NEXT: movd %ecx, %xmm12 -; SSE4-NEXT: movzbl %r15b, %ecx -; SSE4-NEXT: movd %ecx, %xmm13 -; SSE4-NEXT: movzbl %r9b, %ecx -; SSE4-NEXT: movd %ecx, %xmm14 -; SSE4-NEXT: movzbl %dil, %ecx -; SSE4-NEXT: movd %ecx, %xmm15 +; SSE4-NEXT: cmpq %rax, %r13 +; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE4-NEXT: movd %eax, %xmm1 +; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE4-NEXT: movd %eax, %xmm2 +; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE4-NEXT: movd %eax, %xmm3 +; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE4-NEXT: movd %eax, %xmm4 +; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE4-NEXT: movd %eax, %xmm5 +; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE4-NEXT: movd %eax, %xmm6 +; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE4-NEXT: movd %eax, %xmm7 +; SSE4-NEXT: movzbl %r12b, %eax +; SSE4-NEXT: movd %eax, %xmm0 +; SSE4-NEXT: movzbl %bpl, %eax +; SSE4-NEXT: movd %eax, %xmm8 +; SSE4-NEXT: movzbl %r8b, %eax +; SSE4-NEXT: movd %eax, %xmm9 +; SSE4-NEXT: movzbl %r10b, %eax +; SSE4-NEXT: movd %eax, %xmm10 +; SSE4-NEXT: movzbl %r11b, %eax +; SSE4-NEXT: movd %eax, %xmm11 +; SSE4-NEXT: movzbl %bl, %eax +; SSE4-NEXT: movd %eax, %xmm12 +; SSE4-NEXT: movzbl %r14b, %eax +; SSE4-NEXT: movd %eax, %xmm13 +; SSE4-NEXT: movzbl %r9b, %eax +; SSE4-NEXT: movd %eax, %xmm14 +; SSE4-NEXT: movzbl %r15b, %eax +; SSE4-NEXT: movd %eax, %xmm15 ; SSE4-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE4-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; SSE4-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] @@ -1802,76 +1810,76 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; SSE4-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; SSE4-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] ; SSE4-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] -; SSE4-NEXT: sbbq %r8, %rax +; SSE4-NEXT: sbbq %rdx, %rcx ; SSE4-NEXT: sbbb $0, %sil ; SSE4-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE4-NEXT: movzbl %sil, %ecx -; SSE4-NEXT: andl $3, %ecx -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE4-NEXT: movb %cl, 4(%rax) +; SSE4-NEXT: movzbl %sil, %eax +; 
SSE4-NEXT: andl $3, %eax +; SSE4-NEXT: movb %al, 4(%rdi) ; SSE4-NEXT: movdqa %xmm15, -{{[0-9]+}}(%rsp) +; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE4-NEXT: andl $3, %eax +; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE4-NEXT: andl $3, %ecx +; SSE4-NEXT: leaq (%rcx,%rax,4), %rax +; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE4-NEXT: andl $3, %ecx +; SSE4-NEXT: shll $4, %ecx +; SSE4-NEXT: orq %rax, %rcx +; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE4-NEXT: andl $3, %eax +; SSE4-NEXT: shll $6, %eax +; SSE4-NEXT: orq %rcx, %rax ; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx ; SSE4-NEXT: andl $3, %ecx +; SSE4-NEXT: shll $8, %ecx +; SSE4-NEXT: orq %rax, %rcx +; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE4-NEXT: andl $3, %eax +; SSE4-NEXT: shll $10, %eax ; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx ; SSE4-NEXT: andl $3, %edx -; SSE4-NEXT: leaq (%rdx,%rcx,4), %rcx +; SSE4-NEXT: shll $12, %edx +; SSE4-NEXT: orq %rax, %rdx +; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE4-NEXT: andl $3, %esi +; SSE4-NEXT: shll $14, %esi +; SSE4-NEXT: orq %rdx, %rsi +; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE4-NEXT: andl $3, %eax +; SSE4-NEXT: shll $16, %eax +; SSE4-NEXT: orq %rsi, %rax +; SSE4-NEXT: orq %rcx, %rax +; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE4-NEXT: andl $3, %ecx +; SSE4-NEXT: shll $18, %ecx ; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx ; SSE4-NEXT: andl $3, %edx -; SSE4-NEXT: shll $4, %edx +; SSE4-NEXT: shll $20, %edx ; SSE4-NEXT: orq %rcx, %rdx ; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx ; SSE4-NEXT: andl $3, %ecx -; SSE4-NEXT: shll $6, %ecx +; SSE4-NEXT: shll $22, %ecx ; SSE4-NEXT: orq %rdx, %rcx ; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx ; SSE4-NEXT: andl $3, %edx -; SSE4-NEXT: shll $8, %edx +; SSE4-NEXT: shll $24, %edx ; SSE4-NEXT: orq %rcx, %rdx ; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx ; SSE4-NEXT: andl $3, %ecx -; SSE4-NEXT: shll $10, %ecx -; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE4-NEXT: andl $3, %esi -; SSE4-NEXT: shll $12, %esi -; SSE4-NEXT: orq %rcx, %rsi -; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi -; SSE4-NEXT: andl $3, %edi -; SSE4-NEXT: shll $14, %edi -; SSE4-NEXT: orq %rsi, %rdi -; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE4-NEXT: andl $3, %ecx -; SSE4-NEXT: shll $16, %ecx -; SSE4-NEXT: orq %rdi, %rcx +; SSE4-NEXT: shlq $26, %rcx ; SSE4-NEXT: orq %rdx, %rcx +; SSE4-NEXT: orq %rax, %rcx +; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE4-NEXT: andl $3, %eax +; SSE4-NEXT: shlq $28, %rax ; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx ; SSE4-NEXT: andl $3, %edx -; SSE4-NEXT: shll $18, %edx -; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE4-NEXT: andl $3, %esi -; SSE4-NEXT: shll $20, %esi -; SSE4-NEXT: orq %rdx, %rsi -; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE4-NEXT: andl $3, %edx -; SSE4-NEXT: shll $22, %edx -; SSE4-NEXT: orq %rsi, %rdx -; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE4-NEXT: andl $3, %esi -; SSE4-NEXT: shll $24, %esi -; SSE4-NEXT: orq %rdx, %rsi -; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE4-NEXT: andl $3, %edx -; SSE4-NEXT: shlq $26, %rdx -; SSE4-NEXT: orq %rsi, %rdx +; SSE4-NEXT: shlq $30, %rdx +; SSE4-NEXT: orq %rax, %rdx ; SSE4-NEXT: orq %rcx, %rdx -; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE4-NEXT: andl $3, %ecx -; SSE4-NEXT: shlq $28, %rcx -; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE4-NEXT: andl $3, %esi -; SSE4-NEXT: shlq $30, %rsi -; SSE4-NEXT: orq %rcx, %rsi -; SSE4-NEXT: orq %rdx, %rsi -; SSE4-NEXT: movl %esi, (%rax) +; SSE4-NEXT: movl %edx, (%rdi) +; SSE4-NEXT: movq %rdi, %rax 
; SSE4-NEXT: addq $120, %rsp ; SSE4-NEXT: popq %rbx ; SSE4-NEXT: popq %r12 @@ -1961,88 +1969,76 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: andl $127, %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: andl $127, %ecx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE2-NEXT: andl $127, %r11d ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: andl $127, %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; SSE2-NEXT: andl $127, %ebx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; SSE2-NEXT: andl $127, %ebp ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; SSE2-NEXT: andl $127, %edx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE2-NEXT: andl $127, %r10d ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE2-NEXT: andl $127, %r14d -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; SSE2-NEXT: andl $127, %ebp -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; SSE2-NEXT: andl $127, %r13d -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; SSE2-NEXT: andl $127, %r11d +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSE2-NEXT: andl $127, %r8d +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: andl $127, %eax +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; SSE2-NEXT: andl $127, %ebx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; SSE2-NEXT: andl $127, %r15d +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE2-NEXT: cmpq %r13, %rcx +; SSE2-NEXT: movq %r15, %r12 +; SSE2-NEXT: sbbq %rbx, %r12 +; SSE2-NEXT: setb %r12b +; SSE2-NEXT: cmpq %rcx, %r13 +; SSE2-NEXT: sbbq %r15, %rbx +; SSE2-NEXT: sbbb $0, %r12b +; SSE2-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; SSE2-NEXT: cmpq %rcx, %rbx +; SSE2-NEXT: movq %rax, %r15 +; SSE2-NEXT: sbbq %r8, %r15 +; SSE2-NEXT: setb %r15b +; SSE2-NEXT: cmpq %rbx, %rcx +; SSE2-NEXT: sbbq %rax, %r8 +; SSE2-NEXT: sbbb $0, %r15b +; SSE2-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; SSE2-NEXT: cmpq %rax, %r12 -; SSE2-NEXT: movq %r15, %r8 -; SSE2-NEXT: sbbq %r11, %r8 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE2-NEXT: cmpq %rax, %rcx +; SSE2-NEXT: movq %r14, %r8 +; SSE2-NEXT: sbbq %r10, %r8 ; SSE2-NEXT: setb %r8b -; SSE2-NEXT: cmpq %r12, %rax -; SSE2-NEXT: sbbq %r15, %r11 +; SSE2-NEXT: cmpq %rcx, %rax +; SSE2-NEXT: sbbq %r14, %r10 ; SSE2-NEXT: sbbb $0, %r8b ; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSE2-NEXT: cmpq %rax, %r8 -; SSE2-NEXT: movq %r13, %r11 -; SSE2-NEXT: sbbq %rbp, %r11 -; SSE2-NEXT: setb %r11b -; SSE2-NEXT: cmpq %r8, %rax -; SSE2-NEXT: sbbq %r13, %rbp -; SSE2-NEXT: sbbb $0, %r11b -; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSE2-NEXT: cmpq %rax, %r8 -; SSE2-NEXT: movq %r14, %r11 -; SSE2-NEXT: sbbq %r10, %r11 -; SSE2-NEXT: setb %r11b -; SSE2-NEXT: cmpq %r8, %rax -; SSE2-NEXT: sbbq %r14, %r10 -; SSE2-NEXT: sbbb $0, %r11b -; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSE2-NEXT: cmpq %rax, %r8 -; SSE2-NEXT: movq %rdx, %r10 -; SSE2-NEXT: 
sbbq %rbx, %r10 -; SSE2-NEXT: setb %r10b -; SSE2-NEXT: cmpq %r8, %rax -; SSE2-NEXT: sbbq %rdx, %rbx -; SSE2-NEXT: sbbb $0, %r10b -; SSE2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: cmpq %rax, %rdx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE2-NEXT: movq %r10, %r8 -; SSE2-NEXT: sbbq %rcx, %r8 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE2-NEXT: cmpq %rax, %rcx +; SSE2-NEXT: movq %rdx, %r8 +; SSE2-NEXT: sbbq %rbp, %r8 ; SSE2-NEXT: setb %r8b -; SSE2-NEXT: cmpq %rdx, %rax -; SSE2-NEXT: sbbq %r10, %rcx +; SSE2-NEXT: cmpq %rcx, %rax +; SSE2-NEXT: sbbq %rdx, %rbp ; SSE2-NEXT: sbbb $0, %r8b ; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: cmpq %rax, %rcx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE2-NEXT: movq %r10, %rdx ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE2-NEXT: sbbq %r8, %rdx +; SSE2-NEXT: movq %r8, %rdx +; SSE2-NEXT: sbbq %r11, %rdx ; SSE2-NEXT: setb %dl ; SSE2-NEXT: cmpq %rcx, %rax -; SSE2-NEXT: sbbq %r10, %r8 +; SSE2-NEXT: sbbq %r8, %r11 ; SSE2-NEXT: sbbb $0, %dl ; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -2058,117 +2054,129 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; SSE2-NEXT: sbbb $0, %dl ; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: cmpq %rax, %rcx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE2-NEXT: movq %r11, %rdx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT: cmpq %rax, %rdx ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE2-NEXT: sbbq %r10, %rdx -; SSE2-NEXT: setb %r8b -; SSE2-NEXT: cmpq %rcx, %rax -; SSE2-NEXT: sbbq %r11, %r10 -; SSE2-NEXT: sbbb $0, %r8b +; SSE2-NEXT: movq %r10, %rcx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE2-NEXT: sbbq %r8, %rcx +; SSE2-NEXT: setb %cl +; SSE2-NEXT: cmpq %rdx, %rax +; SSE2-NEXT: sbbq %r10, %r8 +; SSE2-NEXT: sbbb $0, %cl +; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: cmpq %rax, %rcx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; SSE2-NEXT: movq %rbx, %rdx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT: cmpq %rax, %rdx ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE2-NEXT: sbbq %r10, %rdx -; SSE2-NEXT: setb %r11b -; SSE2-NEXT: cmpq %rcx, %rax -; SSE2-NEXT: sbbq %rbx, %r10 -; SSE2-NEXT: sbbb $0, %r11b +; SSE2-NEXT: movq %r10, %r8 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: sbbq %rcx, %r8 +; SSE2-NEXT: setb %r12b +; SSE2-NEXT: cmpq %rdx, %rax +; SSE2-NEXT: sbbq %r10, %rcx +; SSE2-NEXT: sbbb $0, %r12b ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: cmpq %rax, %rcx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; SSE2-NEXT: movq %rbx, %rdx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT: cmpq %rax, %rdx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE2-NEXT: movq %r11, %r8 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; 
SSE2-NEXT: sbbq %r10, %r8 +; SSE2-NEXT: setb %cl +; SSE2-NEXT: cmpq %rdx, %rax +; SSE2-NEXT: sbbq %r11, %r10 +; SSE2-NEXT: sbbb $0, %cl +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSE2-NEXT: cmpq %rax, %r8 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE2-NEXT: movq %r11, %rdx ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload ; SSE2-NEXT: sbbq %r10, %rdx ; SSE2-NEXT: setb %dl -; SSE2-NEXT: cmpq %rcx, %rax -; SSE2-NEXT: sbbq %rbx, %r10 +; SSE2-NEXT: cmpq %r8, %rax +; SSE2-NEXT: sbbq %r11, %r10 ; SSE2-NEXT: sbbb $0, %dl ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: cmpq %rax, %rcx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT: cmpq %rax, %r10 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; SSE2-NEXT: movq %rbx, %r8 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE2-NEXT: sbbq %r11, %r8 +; SSE2-NEXT: setb %r8b +; SSE2-NEXT: cmpq %r10, %rax +; SSE2-NEXT: sbbq %rbx, %r11 +; SSE2-NEXT: sbbb $0, %r8b +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE2-NEXT: cmpq %rax, %r11 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; SSE2-NEXT: movq %r14, %r10 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload ; SSE2-NEXT: sbbq %rbx, %r10 ; SSE2-NEXT: setb %r10b -; SSE2-NEXT: cmpq %rcx, %rax +; SSE2-NEXT: cmpq %r11, %rax ; SSE2-NEXT: sbbq %r14, %rbx ; SSE2-NEXT: sbbb $0, %r10b ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; SSE2-NEXT: cmpq %rax, %rbx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE2-NEXT: movq %r15, %rcx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE2-NEXT: sbbq %r14, %rcx -; SSE2-NEXT: setb %cl -; SSE2-NEXT: cmpq %rbx, %rax -; SSE2-NEXT: sbbq %r15, %r14 -; SSE2-NEXT: sbbb $0, %cl -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE2-NEXT: cmpq %rax, %r14 -; SSE2-NEXT: movq (%rsp), %r12 # 8-byte Reload -; SSE2-NEXT: movq %r12, %rbx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE2-NEXT: sbbq %r15, %rbx -; SSE2-NEXT: setb %bl +; SSE2-NEXT: movq (%rsp), %r15 # 8-byte Reload +; SSE2-NEXT: movq %r15, %r11 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; SSE2-NEXT: sbbq %rbx, %r11 +; SSE2-NEXT: setb %r11b ; SSE2-NEXT: cmpq %r14, %rax -; SSE2-NEXT: sbbq %r12, %r15 -; SSE2-NEXT: sbbb $0, %bl +; SSE2-NEXT: sbbq %r15, %rbx +; SSE2-NEXT: sbbb $0, %r11b ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: cmpq %r9, %rax -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE2-NEXT: movq %r12, %r14 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE2-NEXT: sbbq %r15, %r14 +; SSE2-NEXT: movq %r15, %r14 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; SSE2-NEXT: sbbq %rbx, %r14 ; SSE2-NEXT: setb %bpl ; SSE2-NEXT: cmpq %rax, %r9 -; SSE2-NEXT: sbbq %r12, %r15 +; SSE2-NEXT: sbbq %r15, %rbx ; SSE2-NEXT: sbbb $0, %bpl ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: cmpq %rsi, %rax -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE2-NEXT: movq %r15, %r9 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE2-NEXT: sbbq %r14, %r9 +; SSE2-NEXT: movq %r14, %r9 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; SSE2-NEXT: sbbq %rbx, %r9 ; SSE2-NEXT: setb %r9b ; SSE2-NEXT: cmpq 
%rax, %rsi -; SSE2-NEXT: sbbq %r15, %r14 -; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: sbbq %r14, %rbx +; SSE2-NEXT: movq %rdi, %rbx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SSE2-NEXT: sbbb $0, %r9b ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload ; SSE2-NEXT: cmpq %r15, %rsi -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE2-NEXT: movq %r12, %rdi +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: movq %rax, %rdi ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; SSE2-NEXT: sbbq %r14, %rdi ; SSE2-NEXT: setb %dil ; SSE2-NEXT: cmpq %rsi, %r15 -; SSE2-NEXT: sbbq %r12, %r14 +; SSE2-NEXT: sbbq %rax, %r14 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE2-NEXT: sbbb $0, %dil ; SSE2-NEXT: cmpq %rsi, %r14 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: movq %rax, %r15 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; SSE2-NEXT: movq %r13, %r15 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE2-NEXT: sbbq %r12, %r15 +; SSE2-NEXT: sbbq %r13, %r15 ; SSE2-NEXT: setb %r15b ; SSE2-NEXT: cmpq %r14, %rsi -; SSE2-NEXT: sbbq %r13, %r12 +; SSE2-NEXT: sbbq %rax, %r13 ; SSE2-NEXT: sbbb $0, %r15b ; SSE2-NEXT: movzbl %r15b, %esi ; SSE2-NEXT: andl $3, %esi -; SSE2-NEXT: movb %sil, 4(%rax) +; SSE2-NEXT: movb %sil, 4(%rbx) ; SSE2-NEXT: movzbl %dil, %esi ; SSE2-NEXT: movzbl %r9b, %edi ; SSE2-NEXT: andl $3, %esi @@ -2178,59 +2186,60 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; SSE2-NEXT: andl $3, %edi ; SSE2-NEXT: shll $4, %edi ; SSE2-NEXT: orq %rsi, %rdi -; SSE2-NEXT: movzbl %bl, %r9d +; SSE2-NEXT: movzbl %r11b, %r9d ; SSE2-NEXT: andl $3, %r9d ; SSE2-NEXT: shll $6, %r9d ; SSE2-NEXT: orq %rdi, %r9 -; SSE2-NEXT: movzbl %cl, %esi +; SSE2-NEXT: movzbl %r10b, %esi ; SSE2-NEXT: andl $3, %esi ; SSE2-NEXT: shll $8, %esi ; SSE2-NEXT: orq %r9, %rsi -; SSE2-NEXT: movzbl %dl, %ecx -; SSE2-NEXT: movzbl %r10b, %edx -; SSE2-NEXT: andl $3, %edx -; SSE2-NEXT: shll $10, %edx -; SSE2-NEXT: andl $3, %ecx -; SSE2-NEXT: shll $12, %ecx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: movzbl %r11b, %edx +; SSE2-NEXT: movzbl %dl, %edx +; SSE2-NEXT: movzbl %r8b, %edi +; SSE2-NEXT: andl $3, %edi +; SSE2-NEXT: shll $10, %edi ; SSE2-NEXT: andl $3, %edx -; SSE2-NEXT: shll $14, %edx -; SSE2-NEXT: orq %rcx, %rdx -; SSE2-NEXT: movzbl %r8b, %ecx -; SSE2-NEXT: andl $3, %ecx -; SSE2-NEXT: shll $16, %ecx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: orq %rsi, %rcx -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SSE2-NEXT: andl $3, %esi -; SSE2-NEXT: shll $18, %esi +; SSE2-NEXT: shll $12, %edx +; SSE2-NEXT: orq %rdi, %rdx +; SSE2-NEXT: movzbl %cl, %edi +; SSE2-NEXT: andl $3, %edi +; SSE2-NEXT: shll $14, %edi +; SSE2-NEXT: orq %rdx, %rdi +; SSE2-NEXT: movzbl %r12b, %edx ; SSE2-NEXT: andl $3, %edx -; SSE2-NEXT: shll $20, %edx +; SSE2-NEXT: shll $16, %edx +; SSE2-NEXT: orq %rdi, %rdx ; SSE2-NEXT: orq %rsi, %rdx ; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: andl $3, %ecx +; SSE2-NEXT: shll $18, %ecx ; SSE2-NEXT: andl $3, %esi -; SSE2-NEXT: shll $22, %esi -; SSE2-NEXT: orq %rdx, %rsi -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE2-NEXT: andl $3, %edx -; SSE2-NEXT: shll $24, %edx -; 
SSE2-NEXT: orq %rsi, %rdx +; SSE2-NEXT: shll $20, %esi +; SSE2-NEXT: orq %rcx, %rsi +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: andl $3, %ecx +; SSE2-NEXT: shll $22, %ecx +; SSE2-NEXT: orq %rsi, %rcx ; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SSE2-NEXT: andl $3, %esi -; SSE2-NEXT: shlq $26, %rsi -; SSE2-NEXT: orq %rdx, %rsi +; SSE2-NEXT: shll $24, %esi ; SSE2-NEXT: orq %rcx, %rsi ; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE2-NEXT: andl $3, %edx -; SSE2-NEXT: shlq $28, %rdx ; SSE2-NEXT: andl $3, %ecx -; SSE2-NEXT: shlq $30, %rcx -; SSE2-NEXT: orq %rdx, %rcx +; SSE2-NEXT: shlq $26, %rcx ; SSE2-NEXT: orq %rsi, %rcx -; SSE2-NEXT: movl %ecx, (%rax) +; SSE2-NEXT: orq %rdx, %rcx +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: andl $3, %esi +; SSE2-NEXT: shlq $28, %rsi +; SSE2-NEXT: andl $3, %edx +; SSE2-NEXT: shlq $30, %rdx +; SSE2-NEXT: orq %rsi, %rdx +; SSE2-NEXT: orq %rcx, %rdx +; SSE2-NEXT: movl %edx, (%rbx) +; SSE2-NEXT: movq %rbx, %rax ; SSE2-NEXT: addq $88, %rsp ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 @@ -2333,34 +2342,34 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; AVX2-NEXT: andl $127, %r14d ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX2-NEXT: andl $127, %edx -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; AVX2-NEXT: andl $127, %ebp +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX2-NEXT: andl $127, %ebx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT: andl $127, %r11d ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8 ; AVX2-NEXT: andl $127, %r8d -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; AVX2-NEXT: andl $127, %r12d ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r13 ; AVX2-NEXT: andl $127, %r13d -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-NEXT: cmpq %rbx, %r11 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; AVX2-NEXT: cmpq %r12, %rbp ; AVX2-NEXT: movq %r13, %r10 -; AVX2-NEXT: sbbq %r12, %r10 +; AVX2-NEXT: sbbq %r8, %r10 ; AVX2-NEXT: setb %r10b -; AVX2-NEXT: cmpq %r11, %rbx -; AVX2-NEXT: sbbq %r13, %r12 +; AVX2-NEXT: cmpq %rbp, %r12 +; AVX2-NEXT: sbbq %r13, %r8 ; AVX2-NEXT: sbbb $0, %r10b ; AVX2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-NEXT: cmpq %r10, %r11 -; AVX2-NEXT: movq %r8, %rbx -; AVX2-NEXT: sbbq %rbp, %rbx -; AVX2-NEXT: setb %bl -; AVX2-NEXT: cmpq %r11, %r10 -; AVX2-NEXT: sbbq %r8, %rbp -; AVX2-NEXT: sbbb $0, %bl -; AVX2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: cmpq %r8, %r10 +; AVX2-NEXT: movq %r11, %r12 +; AVX2-NEXT: sbbq %rbx, %r12 +; AVX2-NEXT: setb %bpl +; AVX2-NEXT: cmpq %r10, %r8 +; AVX2-NEXT: sbbq %r11, %rbx +; AVX2-NEXT: sbbb $0, %bpl +; AVX2-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: cmpq %r8, %r10 @@ -2431,13 +2440,13 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX2-NEXT: cmpq %rax, %rdx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte 
Reload +; AVX2-NEXT: movq %rbx, %r10 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: movq %r11, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: sbbq %r10, %r8 +; AVX2-NEXT: sbbq %r11, %r10 ; AVX2-NEXT: setb %r8b ; AVX2-NEXT: cmpq %rdx, %rax -; AVX2-NEXT: sbbq %r11, %r10 +; AVX2-NEXT: sbbq %rbx, %r11 ; AVX2-NEXT: sbbb $0, %r8b ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 @@ -2451,81 +2460,81 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; AVX2-NEXT: sbbq %rbx, %r11 ; AVX2-NEXT: sbbb $0, %dl ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-NEXT: cmpq %rax, %r11 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX2-NEXT: cmpq %rax, %rbx ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; AVX2-NEXT: movq %r14, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: sbbq %rbx, %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX2-NEXT: sbbq %r11, %r10 ; AVX2-NEXT: setb %r10b -; AVX2-NEXT: cmpq %r11, %rax -; AVX2-NEXT: sbbq %r14, %rbx -; AVX2-NEXT: sbbb $0, %r10b -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; AVX2-NEXT: cmpq %rax, %rbx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: movq %r15, %r11 -; AVX2-NEXT: movq (%rsp), %r14 # 8-byte Reload -; AVX2-NEXT: sbbq %r14, %r11 -; AVX2-NEXT: setb %r11b ; AVX2-NEXT: cmpq %rbx, %rax -; AVX2-NEXT: sbbq %r15, %r14 -; AVX2-NEXT: sbbb $0, %r11b +; AVX2-NEXT: sbbq %r14, %r11 +; AVX2-NEXT: sbbb $0, %r10b ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; AVX2-NEXT: cmpq %rax, %r14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX2-NEXT: movq %r13, %rbx ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: sbbq %r15, %rbx +; AVX2-NEXT: movq %r15, %rbx +; AVX2-NEXT: movq (%rsp), %r11 # 8-byte Reload +; AVX2-NEXT: sbbq %r11, %rbx ; AVX2-NEXT: setb %bl ; AVX2-NEXT: cmpq %r14, %rax -; AVX2-NEXT: sbbq %r13, %r15 +; AVX2-NEXT: sbbq %r15, %r11 ; AVX2-NEXT: sbbb $0, %bl ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: cmpq %r9, %rax +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX2-NEXT: cmpq %rax, %r14 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX2-NEXT: movq %r13, %r14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: sbbq %r15, %r14 +; AVX2-NEXT: movq %r13, %r15 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX2-NEXT: sbbq %r11, %r15 ; AVX2-NEXT: setb %bpl -; AVX2-NEXT: cmpq %rax, %r9 -; AVX2-NEXT: sbbq %r13, %r15 +; AVX2-NEXT: cmpq %r14, %rax +; AVX2-NEXT: sbbq %r13, %r11 ; AVX2-NEXT: sbbb $0, %bpl ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: cmpq %r9, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: movq %r15, %r14 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX2-NEXT: sbbq %r11, %r14 +; AVX2-NEXT: setb %r14b +; AVX2-NEXT: cmpq %rax, %r9 +; AVX2-NEXT: sbbq %r15, %r11 +; AVX2-NEXT: sbbb $0, %r14b +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: cmpq %rsi, %rax ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload ; AVX2-NEXT: movq %r15, %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: sbbq %r14, %r9 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte 
Reload +; AVX2-NEXT: sbbq %r11, %r9 ; AVX2-NEXT: setb %r9b ; AVX2-NEXT: cmpq %rax, %rsi -; AVX2-NEXT: sbbq %r15, %r14 +; AVX2-NEXT: sbbq %r15, %r11 ; AVX2-NEXT: sbbb $0, %r9b ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: cmpq %rcx, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX2-NEXT: movq %r11, %rsi ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: movq %r15, %rsi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: sbbq %r14, %rsi +; AVX2-NEXT: sbbq %r15, %rsi ; AVX2-NEXT: setb %sil ; AVX2-NEXT: cmpq %rax, %rcx ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: sbbq %r15, %r14 +; AVX2-NEXT: sbbq %r11, %r15 ; AVX2-NEXT: sbbb $0, %sil ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX2-NEXT: cmpq %rax, %rcx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX2-NEXT: movq %r11, %r15 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX2-NEXT: movq %r13, %r14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: sbbq %r15, %r14 -; AVX2-NEXT: setb %r14b -; AVX2-NEXT: cmpq %rcx, %rax ; AVX2-NEXT: sbbq %r13, %r15 +; AVX2-NEXT: setb %r15b +; AVX2-NEXT: cmpq %rcx, %rax +; AVX2-NEXT: sbbq %r11, %r13 ; AVX2-NEXT: movq %rdi, %rax -; AVX2-NEXT: sbbb $0, %r14b -; AVX2-NEXT: movzbl %r14b, %ecx +; AVX2-NEXT: sbbb $0, %r15b +; AVX2-NEXT: movzbl %r15b, %ecx ; AVX2-NEXT: andl $3, %ecx ; AVX2-NEXT: movb %cl, 4(%rdi) ; AVX2-NEXT: movzbl %sil, %ecx @@ -2533,15 +2542,15 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; AVX2-NEXT: movzbl %r9b, %esi ; AVX2-NEXT: andl $3, %esi ; AVX2-NEXT: leaq (%rsi,%rcx,4), %rcx -; AVX2-NEXT: movzbl %bpl, %esi +; AVX2-NEXT: movzbl %r14b, %esi ; AVX2-NEXT: andl $3, %esi ; AVX2-NEXT: shll $4, %esi ; AVX2-NEXT: orq %rcx, %rsi -; AVX2-NEXT: movzbl %bl, %ecx +; AVX2-NEXT: movzbl %bpl, %ecx ; AVX2-NEXT: andl $3, %ecx ; AVX2-NEXT: shll $6, %ecx ; AVX2-NEXT: orq %rsi, %rcx -; AVX2-NEXT: movzbl %r11b, %esi +; AVX2-NEXT: movzbl %bl, %esi ; AVX2-NEXT: andl $3, %esi ; AVX2-NEXT: shll $8, %esi ; AVX2-NEXT: orq %rcx, %rsi @@ -2676,18 +2685,18 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: andl $127, %eax ; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; AVX512-NEXT: andl $127, %ebp -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; AVX512-NEXT: andl $127, %r12d ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r13 ; AVX512-NEXT: andl $127, %r13d +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; AVX512-NEXT: andl $127, %ebp ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; AVX512-NEXT: andl $127, %r15d +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX512-NEXT: andl $127, %r12d ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: andl $127, %r10d -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; AVX512-NEXT: andl $127, %ebx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512-NEXT: andl $127, %r14d ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8 ; AVX512-NEXT: andl $127, %r8d ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r9 @@ -2700,13 +2709,13 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; AVX512-NEXT: andl $127, %eax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: andl $127, %edx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: cmpq 
%r14, %r11 +; AVX512-NEXT: cmpq %rbx, %r11 ; AVX512-NEXT: movq %rdx, %rcx ; AVX512-NEXT: sbbq %rax, %rcx ; AVX512-NEXT: setb %cl -; AVX512-NEXT: cmpq %r11, %r14 +; AVX512-NEXT: cmpq %r11, %rbx ; AVX512-NEXT: sbbq %rdx, %rax ; AVX512-NEXT: sbbb $0, %cl ; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill @@ -2733,31 +2742,31 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: cmpq %rax, %rcx -; AVX512-NEXT: movq %rbx, %rdx +; AVX512-NEXT: movq %r14, %rdx ; AVX512-NEXT: sbbq %r10, %rdx ; AVX512-NEXT: setb %dl ; AVX512-NEXT: cmpq %rcx, %rax -; AVX512-NEXT: sbbq %rbx, %r10 +; AVX512-NEXT: sbbq %r14, %r10 ; AVX512-NEXT: sbbb $0, %dl ; AVX512-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: cmpq %rax, %rcx -; AVX512-NEXT: movq %r15, %rdx -; AVX512-NEXT: sbbq %r13, %rdx +; AVX512-NEXT: movq %r12, %rdx +; AVX512-NEXT: sbbq %r15, %rdx ; AVX512-NEXT: setb %dl ; AVX512-NEXT: cmpq %rcx, %rax -; AVX512-NEXT: sbbq %r15, %r13 +; AVX512-NEXT: sbbq %r12, %r15 ; AVX512-NEXT: sbbb $0, %dl ; AVX512-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: cmpq %rax, %rcx -; AVX512-NEXT: movq %r12, %rdx -; AVX512-NEXT: sbbq %rbp, %rdx +; AVX512-NEXT: movq %rbp, %rdx +; AVX512-NEXT: sbbq %r13, %rdx ; AVX512-NEXT: setb %dl ; AVX512-NEXT: cmpq %rcx, %rax -; AVX512-NEXT: sbbq %r12, %rbp +; AVX512-NEXT: sbbq %rbp, %r13 ; AVX512-NEXT: sbbb $0, %dl ; AVX512-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -2767,21 +2776,10 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; AVX512-NEXT: movq %rdi, %rdx ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; AVX512-NEXT: sbbq %rsi, %rdx -; AVX512-NEXT: setb %r13b -; AVX512-NEXT: cmpq %rcx, %rax -; AVX512-NEXT: sbbq %rdi, %rsi -; AVX512-NEXT: sbbb $0, %r13b -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: cmpq %rax, %rcx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX512-NEXT: movq %rdi, %rdx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX512-NEXT: sbbq %rsi, %rdx -; AVX512-NEXT: setb %bpl +; AVX512-NEXT: setb %r15b ; AVX512-NEXT: cmpq %rcx, %rax ; AVX512-NEXT: sbbq %rdi, %rsi -; AVX512-NEXT: sbbb $0, %bpl +; AVX512-NEXT: sbbb $0, %r15b ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: cmpq %rcx, %rdx @@ -2789,10 +2787,10 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; AVX512-NEXT: movq %rdi, %rax ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; AVX512-NEXT: sbbq %rsi, %rax -; AVX512-NEXT: setb %r9b +; AVX512-NEXT: setb %bl ; AVX512-NEXT: cmpq %rdx, %rcx ; AVX512-NEXT: sbbq %rdi, %rsi -; AVX512-NEXT: sbbb $0, %r9b +; AVX512-NEXT: sbbb $0, %bl ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; AVX512-NEXT: cmpq %rdx, %rsi @@ -2818,107 +2816,118 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8 ; AVX512-NEXT: cmpq %rdi, %r8 -; AVX512-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: movq %r10, %rsi +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AVX512-NEXT: movq %r9, %rsi ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX512-NEXT: sbbq %rax, %rsi ; AVX512-NEXT: setb %sil ; AVX512-NEXT: cmpq %r8, %rdi -; AVX512-NEXT: sbbq %r10, %rax +; AVX512-NEXT: sbbq %r9, %rax ; AVX512-NEXT: sbbb $0, %sil ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: cmpq %r8, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT: movq %r11, %rdi +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; AVX512-NEXT: cmpq %r8, %r9 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX512-NEXT: movq %r10, %rdi ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX512-NEXT: sbbq %rax, %rdi ; AVX512-NEXT: setb %dil -; AVX512-NEXT: cmpq %r10, %r8 -; AVX512-NEXT: sbbq %r11, %rax +; AVX512-NEXT: cmpq %r9, %r8 +; AVX512-NEXT: sbbq %r10, %rax ; AVX512-NEXT: sbbb $0, %dil +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: cmpq %r9, %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX512-NEXT: movq %r11, %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: sbbq %rax, %r8 +; AVX512-NEXT: setb %r8b +; AVX512-NEXT: cmpq %r10, %r9 +; AVX512-NEXT: sbbq %r11, %rax +; AVX512-NEXT: sbbb $0, %r8b ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX512-NEXT: cmpq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: movq %rbx, %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX512-NEXT: movq %r14, %r9 ; AVX512-NEXT: movq (%rsp), %r11 # 8-byte Reload -; AVX512-NEXT: sbbq %r11, %r8 -; AVX512-NEXT: setb %r8b +; AVX512-NEXT: sbbq %r11, %r9 +; AVX512-NEXT: setb %r9b ; AVX512-NEXT: cmpq %r10, %rax -; AVX512-NEXT: sbbq %rbx, %r11 -; AVX512-NEXT: sbbb $0, %r8b +; AVX512-NEXT: sbbq %r14, %r11 +; AVX512-NEXT: sbbb $0, %r9b ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: cmpq %rbx, %r11 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX512-NEXT: movq %r14, %r10 +; AVX512-NEXT: cmpq %r14, %r11 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; AVX512-NEXT: movq %r12, %r10 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX512-NEXT: sbbq %rax, %r10 ; AVX512-NEXT: setb %r10b -; AVX512-NEXT: cmpq %r11, %rbx -; AVX512-NEXT: sbbq %r14, %rax +; AVX512-NEXT: cmpq %r11, %r14 +; AVX512-NEXT: sbbq %r12, %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-NEXT: sbbb $0, %r10b -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: cmpq %r15, %r11 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: cmpq %r13, %r11 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: movq %rax, %rbx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX512-NEXT: sbbq %r14, %rbx -; AVX512-NEXT: setb %bl -; AVX512-NEXT: cmpq %r11, %r15 +; AVX512-NEXT: movq %rax, %r14 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; AVX512-NEXT: sbbq %r12, %r14 +; AVX512-NEXT: setb %bpl +; AVX512-NEXT: cmpq %r11, %r13 ; AVX512-NEXT: movq 
{{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: sbbq %rax, %r14 +; AVX512-NEXT: sbbq %rax, %r12 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; AVX512-NEXT: sbbb $0, %bl +; AVX512-NEXT: sbbb $0, %bpl ; AVX512-NEXT: cmpq %r11, %r14 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: movq %rax, %r15 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; AVX512-NEXT: sbbq %r12, %r15 -; AVX512-NEXT: setb %r15b +; AVX512-NEXT: movq %rax, %r12 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: sbbq %r13, %r12 +; AVX512-NEXT: setb %r12b ; AVX512-NEXT: cmpq %r14, %r11 -; AVX512-NEXT: sbbq %rax, %r12 -; AVX512-NEXT: sbbb $0, %r15b -; AVX512-NEXT: movzbl %r15b, %r11d +; AVX512-NEXT: sbbq %rax, %r13 +; AVX512-NEXT: sbbb $0, %r12b +; AVX512-NEXT: movzbl %r12b, %r11d ; AVX512-NEXT: andl $3, %r11d ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; AVX512-NEXT: movb %r11b, 4(%r14) -; AVX512-NEXT: movzbl %bl, %r11d +; AVX512-NEXT: movzbl %bpl, %r11d ; AVX512-NEXT: andl $3, %r11d ; AVX512-NEXT: movzbl %r10b, %r10d ; AVX512-NEXT: andl $3, %r10d ; AVX512-NEXT: leaq (%r10,%r11,4), %r10 +; AVX512-NEXT: movzbl %r9b, %r9d +; AVX512-NEXT: andl $3, %r9d +; AVX512-NEXT: shll $4, %r9d +; AVX512-NEXT: orq %r10, %r9 ; AVX512-NEXT: movzbl %r8b, %r8d ; AVX512-NEXT: andl $3, %r8d -; AVX512-NEXT: shll $4, %r8d -; AVX512-NEXT: orq %r10, %r8 +; AVX512-NEXT: shll $6, %r8d +; AVX512-NEXT: orq %r9, %r8 ; AVX512-NEXT: movzbl %dil, %edi ; AVX512-NEXT: andl $3, %edi -; AVX512-NEXT: shll $6, %edi +; AVX512-NEXT: shll $8, %edi ; AVX512-NEXT: orq %r8, %rdi ; AVX512-NEXT: movzbl %sil, %esi ; AVX512-NEXT: andl $3, %esi -; AVX512-NEXT: shll $8, %esi -; AVX512-NEXT: orq %rdi, %rsi +; AVX512-NEXT: shll $10, %esi ; AVX512-NEXT: movzbl %dl, %edx ; AVX512-NEXT: andl $3, %edx -; AVX512-NEXT: shll $10, %edx +; AVX512-NEXT: shll $12, %edx +; AVX512-NEXT: orq %rsi, %rdx ; AVX512-NEXT: movzbl %cl, %ecx ; AVX512-NEXT: andl $3, %ecx -; AVX512-NEXT: shll $12, %ecx +; AVX512-NEXT: shll $14, %ecx ; AVX512-NEXT: orq %rdx, %rcx -; AVX512-NEXT: movzbl %r9b, %edx -; AVX512-NEXT: andl $3, %edx -; AVX512-NEXT: shll $14, %edx -; AVX512-NEXT: orq %rcx, %rdx -; AVX512-NEXT: movzbl %bpl, %eax +; AVX512-NEXT: movzbl %bl, %eax ; AVX512-NEXT: andl $3, %eax ; AVX512-NEXT: shll $16, %eax -; AVX512-NEXT: orq %rdx, %rax -; AVX512-NEXT: orq %rsi, %rax -; AVX512-NEXT: movzbl %r13b, %ecx +; AVX512-NEXT: orq %rcx, %rax +; AVX512-NEXT: orq %rdi, %rax +; AVX512-NEXT: movzbl %r15b, %ecx ; AVX512-NEXT: andl $3, %ecx ; AVX512-NEXT: shll $18, %ecx ; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload @@ -2963,7 +2972,7 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $132, %esp +; X86-NEXT: subl $128, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $127, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -3054,31 +3063,47 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $127, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: andl $127, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: andl $127, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: andl $127, %edx ; X86-NEXT: 
movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: andl $127, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: andl $127, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: andl $127, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: sbbl %eax, %esi -; X86-NEXT: movl %edi, %esi -; X86-NEXT: sbbl %edx, %esi -; X86-NEXT: movl $0, %esi -; X86-NEXT: sbbl %esi, %esi +; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: sbbl %edx, %ebp +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: movl $0, %ebp +; X86-NEXT: sbbl %ebp, %ebp ; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: sbbb $0, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl %ebx, %eax -; X86-NEXT: sbbl %edi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: sbbl %edx, %ebp +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: sbbl %edi, %ebp +; X86-NEXT: movl $0, %ebp +; X86-NEXT: sbbl %ebp, %ebp +; X86-NEXT: setb %cl +; X86-NEXT: cmpl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %ebx, %edi ; X86-NEXT: movl $0, %eax ; X86-NEXT: sbbl %eax, %eax ; X86-NEXT: sbbb $0, %cl @@ -3091,6 +3116,7 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; X86-NEXT: sbbl %edx, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: sbbl %ebp, %edi ; X86-NEXT: movl $0, %edi ; X86-NEXT: sbbl %edi, %edi @@ -3243,26 +3269,6 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; X86-NEXT: sbbb $0, %cl ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %edi -; X86-NEXT: sbbl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: sbbl %ebx, %edi -; X86-NEXT: movl $0, %edi -; X86-NEXT: sbbl %edi, %edi -; X86-NEXT: setb %cl -; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: sbbl %ebp, %ebx -; X86-NEXT: movl $0, %eax -; X86-NEXT: sbbl %eax, %eax -; X86-NEXT: sbbb $0, %cl -; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -3387,8 +3393,8 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: 
sbbl %edi, %ebp
+; X86-NEXT: movl %edi, %ebp
+; X86-NEXT: sbbl %esi, %ebp
 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT: movl %edx, %ebp
 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3398,7 +3404,7 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X86-NEXT: setb %cl
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: sbbl %esi, %edi
+; X86-NEXT: sbbl %edi, %esi
 ; X86-NEXT: sbbl %edx, %eax
 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT: sbbl %eax, %eax
@@ -3469,7 +3475,7 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X86-NEXT: orl %eax, %edx
 ; X86-NEXT: movl %edx, (%edi)
 ; X86-NEXT: movl %edi, %eax
-; X86-NEXT: addl $132, %esp
+; X86-NEXT: addl $128, %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx