diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a7e4bc7401477..6d4c067ed0f6e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -27514,7 +27514,6 @@ bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
   return false;
 }
 
-// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
 // TODO: In 32-bit mode, use FISTP when X87 is available?
 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   Type *MemType = SI->getValueOperand()->getType();
@@ -27522,7 +27521,7 @@ bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   bool NoImplicitFloatOps =
       SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
   if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
-      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2())
+      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE1())
     return false;
 
   return needsCmpXchgNb(MemType);
@@ -28281,16 +28280,18 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
     return Op;
 
   if (VT == MVT::i64 && !IsTypeLegal) {
-    // For illegal i64 atomic_stores, we can try to use MOVQ if SSE2 is enabled.
-    // FIXME: Use movlps with SSE1.
+    // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
+    // is enabled.
     // FIXME: Use fist with X87.
     bool NoImplicitFloatOps =
         DAG.getMachineFunction().getFunction().hasFnAttribute(
             Attribute::NoImplicitFloat);
     if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
-        Subtarget.hasSSE2()) {
+        Subtarget.hasSSE1()) {
       SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                      Node->getOperand(2));
+      MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
+      SclToVec = DAG.getBitcast(StVT, SclToVec);
       SDVTList Tys = DAG.getVTList(MVT::Other);
       SDValue Ops[] = { Node->getChain(), SclToVec, Node->getBasePtr() };
       SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys,
@@ -42370,6 +42371,28 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const X86Subtarget &Subtarget) {
+  auto *St = cast<MemIntrinsicSDNode>(N);
+
+  SDValue StoredVal = N->getOperand(1);
+  MVT VT = StoredVal.getSimpleValueType();
+  EVT MemVT = St->getMemoryVT();
+
+  // Figure out which elements we demand.
+  unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
+  APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
+
+  APInt KnownUndef, KnownZero;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef,
+                                     KnownZero, DCI))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
 /// Return 'true' if this vector operation is "horizontal"
 /// and return the operands for the horizontal operation in LHS and RHS. A
 /// horizontal operation performs the binary operation on successive elements
@@ -46751,6 +46774,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::MLOAD:      return combineMaskedLoad(N, DAG, DCI, Subtarget);
   case ISD::STORE:      return combineStore(N, DAG, DCI, Subtarget);
   case ISD::MSTORE:     return combineMaskedStore(N, DAG, DCI, Subtarget);
+  case X86ISD::VEXTRACT_STORE:
+    return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
   case ISD::SINT_TO_FP:
   case ISD::STRICT_SINT_TO_FP:
     return combineSIntToFP(N, DAG, DCI, Subtarget);
diff --git a/llvm/test/CodeGen/X86/atomic-fp.ll b/llvm/test/CodeGen/X86/atomic-fp.ll
index 2d5417b11e86a..24950b53fb416 100644
--- a/llvm/test/CodeGen/X86/atomic-fp.ll
+++ b/llvm/test/CodeGen/X86/atomic-fp.ll
@@ -114,33 +114,23 @@ define void @fadd_64r(double* %loc, double %val) nounwind {
 ; X86-SSE1:       # %bb.0:
 ; X86-SSE1-NEXT:    pushl %ebp
 ; X86-SSE1-NEXT:    movl %esp, %ebp
-; X86-SSE1-NEXT:    pushl %ebx
-; X86-SSE1-NEXT:    pushl %esi
 ; X86-SSE1-NEXT:    andl $-8, %esp
 ; X86-SSE1-NEXT:    subl $24, %esp
-; X86-SSE1-NEXT:    movl 8(%ebp), %esi
-; X86-SSE1-NEXT:    fildll (%esi)
+; X86-SSE1-NEXT:    movl 8(%ebp), %eax
+; X86-SSE1-NEXT:    fildll (%eax)
 ; X86-SSE1-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-SSE1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-SSE1-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X86-SSE1-NEXT:    faddl 12(%ebp)
 ; X86-SSE1-NEXT:    fstpl (%esp)
-; X86-SSE1-NEXT:    movl (%esp), %ebx
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movl (%esi), %eax
-; X86-SSE1-NEXT:    movl 4(%esi), %edx
-; X86-SSE1-NEXT:    .p2align 4, 0x90
-; X86-SSE1-NEXT:  .LBB1_1: # %atomicrmw.start
-; X86-SSE1-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-SSE1-NEXT:    lock cmpxchg8b (%esi)
-; X86-SSE1-NEXT:    jne .LBB1_1
-; X86-SSE1-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-SSE1-NEXT:    leal -8(%ebp), %esp
-; X86-SSE1-NEXT:    popl %esi
-; X86-SSE1-NEXT:    popl %ebx
+; X86-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE1-NEXT:    movlps %xmm0, (%eax)
+; X86-SSE1-NEXT:    movl %ebp, %esp
 ; X86-SSE1-NEXT:    popl %ebp
 ; X86-SSE1-NEXT:    retl
 ;
@@ -298,9 +288,8 @@ define void @fadd_64g() nounwind {
 ; X86-SSE1:       # %bb.0:
 ; X86-SSE1-NEXT:    pushl %ebp
 ; X86-SSE1-NEXT:    movl %esp, %ebp
-; X86-SSE1-NEXT:    pushl %ebx
 ; X86-SSE1-NEXT:    andl $-8, %esp
-; X86-SSE1-NEXT:    subl $32, %esp
+; X86-SSE1-NEXT:    subl $24, %esp
 ; X86-SSE1-NEXT:    fildll glob64
 ; X86-SSE1-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -310,18 +299,11 @@ define void @fadd_64g() nounwind {
 ; X86-SSE1-NEXT:    fld1
 ; X86-SSE1-NEXT:    faddl {{[0-9]+}}(%esp)
 ; X86-SSE1-NEXT:    fstpl (%esp)
-; X86-SSE1-NEXT:    movl (%esp), %ebx
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movl glob64+4, %edx
-; X86-SSE1-NEXT:    movl glob64, %eax
-; X86-SSE1-NEXT:    .p2align 4, 0x90
-; X86-SSE1-NEXT:  .LBB3_1: # %atomicrmw.start
-; X86-SSE1-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-SSE1-NEXT:    lock cmpxchg8b glob64
-; X86-SSE1-NEXT:    jne .LBB3_1
-; X86-SSE1-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-SSE1-NEXT:    leal -4(%ebp), %esp
-; X86-SSE1-NEXT:    popl %ebx
+; X86-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE1-NEXT:    movlps %xmm0, glob64
+; X86-SSE1-NEXT:    movl %ebp, %esp
 ; X86-SSE1-NEXT:    popl %ebp
 ; X86-SSE1-NEXT:    retl
 ;
@@ -477,9 +459,8 @@ define void @fadd_64imm() nounwind {
 ; X86-SSE1:       # %bb.0:
 ; X86-SSE1-NEXT:    pushl %ebp
 ; X86-SSE1-NEXT:    movl %esp, %ebp
-; X86-SSE1-NEXT:    pushl %ebx
 ; X86-SSE1-NEXT:    andl $-8, %esp
-; X86-SSE1-NEXT:    subl $32, %esp
+; X86-SSE1-NEXT:    subl $24, %esp
 ; X86-SSE1-NEXT:    fildll -559038737
 ; X86-SSE1-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -489,18 +470,11 @@ define void @fadd_64imm() nounwind {
 ; X86-SSE1-NEXT:    fld1
 ; X86-SSE1-NEXT:    faddl {{[0-9]+}}(%esp)
 ; X86-SSE1-NEXT:    fstpl (%esp)
-; X86-SSE1-NEXT:    movl (%esp), %ebx
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movl -559038737, %eax
-; X86-SSE1-NEXT:    movl -559038733, %edx
-; X86-SSE1-NEXT:    .p2align 4, 0x90
-; X86-SSE1-NEXT:  .LBB5_1: # %atomicrmw.start
-; X86-SSE1-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-SSE1-NEXT:    lock cmpxchg8b -559038737
-; X86-SSE1-NEXT:    jne .LBB5_1
-; X86-SSE1-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-SSE1-NEXT:    leal -4(%ebp), %esp
-; X86-SSE1-NEXT:    popl %ebx
+; X86-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE1-NEXT:    movlps %xmm0, -559038737
+; X86-SSE1-NEXT:    movl %ebp, %esp
 ; X86-SSE1-NEXT:    popl %ebp
 ; X86-SSE1-NEXT:    retl
 ;
@@ -662,10 +636,9 @@ define void @fadd_64stack() nounwind {
 ; X86-SSE1:       # %bb.0:
 ; X86-SSE1-NEXT:    pushl %ebp
 ; X86-SSE1-NEXT:    movl %esp, %ebp
-; X86-SSE1-NEXT:    pushl %ebx
 ; X86-SSE1-NEXT:    andl $-8, %esp
-; X86-SSE1-NEXT:    subl $40, %esp
-; X86-SSE1-NEXT:    fildll (%esp)
+; X86-SSE1-NEXT:    subl $32, %esp
+; X86-SSE1-NEXT:    fildll {{[0-9]+}}(%esp)
 ; X86-SSE1-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -673,19 +646,12 @@ define void @fadd_64stack() nounwind {
 ; X86-SSE1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-SSE1-NEXT:    fld1
 ; X86-SSE1-NEXT:    faddl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    fstpl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movl (%esp), %eax
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE1-NEXT:    .p2align 4, 0x90
-; X86-SSE1-NEXT:  .LBB7_1: # %atomicrmw.start
-; X86-SSE1-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-SSE1-NEXT:    lock cmpxchg8b (%esp)
-; X86-SSE1-NEXT:    jne .LBB7_1
-; X86-SSE1-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-SSE1-NEXT:    leal -4(%ebp), %esp
-; X86-SSE1-NEXT:    popl %ebx
+; X86-SSE1-NEXT:    fstpl (%esp)
+; X86-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE1-NEXT:    movlps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    movl %ebp, %esp
 ; X86-SSE1-NEXT:    popl %ebp
 ; X86-SSE1-NEXT:    retl
 ;
@@ -784,36 +750,26 @@ define void @fadd_array(i64* %arg, double %arg1, i64 %arg2) nounwind {
 ; X86-SSE1:       # %bb.0: # %bb
 ; X86-SSE1-NEXT:    pushl %ebp
 ; X86-SSE1-NEXT:    movl %esp, %ebp
-; X86-SSE1-NEXT:    pushl %ebx
-; X86-SSE1-NEXT:    pushl %edi
 ; X86-SSE1-NEXT:    pushl %esi
 ; X86-SSE1-NEXT:    andl $-8, %esp
 ; X86-SSE1-NEXT:    subl $32, %esp
-; X86-SSE1-NEXT:    movl 20(%ebp), %esi
-; X86-SSE1-NEXT:    movl 8(%ebp), %edi
-; X86-SSE1-NEXT:    fildll (%edi,%esi,8)
+; X86-SSE1-NEXT:    movl 20(%ebp), %eax
+; X86-SSE1-NEXT:    movl 8(%ebp), %ecx
+; X86-SSE1-NEXT:    fildll (%ecx,%eax,8)
 ; X86-SSE1-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE1-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-SSE1-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X86-SSE1-NEXT:    faddl 12(%ebp)
 ; X86-SSE1-NEXT:    fstpl (%esp)
-; X86-SSE1-NEXT:    movl (%esp), %ebx
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movl (%edi,%esi,8), %eax
-; X86-SSE1-NEXT:    movl 4(%edi,%esi,8), %edx
-; X86-SSE1-NEXT:    .p2align 4, 0x90
-; X86-SSE1-NEXT:  .LBB8_1: # %atomicrmw.start
-; X86-SSE1-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-SSE1-NEXT:    lock cmpxchg8b (%edi,%esi,8)
-; X86-SSE1-NEXT:    jne .LBB8_1
-; X86-SSE1-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-SSE1-NEXT:    leal -12(%ebp), %esp
+; X86-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE1-NEXT:    movlps %xmm0, (%ecx,%eax,8)
+; X86-SSE1-NEXT:    leal -4(%ebp), %esp
 ; X86-SSE1-NEXT:    popl %esi
-; X86-SSE1-NEXT:    popl %edi
-; X86-SSE1-NEXT:    popl %ebx
 ; X86-SSE1-NEXT:    popl %ebp
 ; X86-SSE1-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll
index d3aae068dd606..ef31b2758dfea 100644
--- a/llvm/test/CodeGen/X86/atomic-non-integer.ll
+++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll
@@ -55,27 +55,11 @@ define void @store_float(float* %fptr, float %v) {
 define void @store_double(double* %fptr, double %v) {
 ; X86-SSE1-LABEL: store_double:
 ; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl %ebx
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE1-NEXT:    pushl %esi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
-; X86-SSE1-NEXT:    .cfi_offset %esi, -12
-; X86-SSE1-NEXT:    .cfi_offset %ebx, -8
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movl (%esi), %eax
-; X86-SSE1-NEXT:    movl 4(%esi), %edx
-; X86-SSE1-NEXT:    .p2align 4, 0x90
-; X86-SSE1-NEXT:  .LBB2_1: # %atomicrmw.start
-; X86-SSE1-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-SSE1-NEXT:    lock cmpxchg8b (%esi)
-; X86-SSE1-NEXT:    jne .LBB2_1
-; X86-SSE1-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-SSE1-NEXT:    popl %esi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE1-NEXT:    popl %ebx
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE1-NEXT:    movlps %xmm0, (%eax)
 ; X86-SSE1-NEXT:    retl
 ;
 ; X86-SSE2-LABEL: store_double:
@@ -568,27 +552,12 @@ define void @store_float_seq_cst(float* %fptr, float %v) {
 define void @store_double_seq_cst(double* %fptr, double %v) {
 ; X86-SSE1-LABEL: store_double_seq_cst:
 ; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl %ebx
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE1-NEXT:    pushl %esi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
-; X86-SSE1-NEXT:    .cfi_offset %esi, -12
-; X86-SSE1-NEXT:    .cfi_offset %ebx, -8
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movl (%esi), %eax
-; X86-SSE1-NEXT:    movl 4(%esi), %edx
-; X86-SSE1-NEXT:    .p2align 4, 0x90
-; X86-SSE1-NEXT:  .LBB9_1: # %atomicrmw.start
-; X86-SSE1-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-SSE1-NEXT:    lock cmpxchg8b (%esi)
-; X86-SSE1-NEXT:    jne .LBB9_1
-; X86-SSE1-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-SSE1-NEXT:    popl %esi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE1-NEXT:    popl %ebx
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE1-NEXT:    movlps %xmm0, (%eax)
+; X86-SSE1-NEXT:    lock orl $0, (%esp)
 ; X86-SSE1-NEXT:    retl
 ;
 ; X86-SSE2-LABEL: store_double_seq_cst:
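
For reference, a minimal sketch of the pattern this change targets, modeled on the store_double checks above. The function name and RUN-line flags are assumptions for illustration only; something like llc -mtriple=i686-- -mattr=+sse,-sse2 would exercise the SSE1-only path:

define void @store_double_sse1(double* %fptr, double %v) {
  ; An unordered 64-bit atomic store that previously expanded to a lock cmpxchg8b loop on SSE1-only targets.
  store atomic double %v, double* %fptr unordered, align 8
  ret void
}

With only SSE1 available, the value is now rebuilt in an XMM register from its two 32-bit halves (movss + movss + unpcklps) and written with a single 8-byte movlps, instead of the cmpxchg8b retry loop. For seq_cst stores, the same sequence is followed by the implicit fence lock orl $0, (%esp), as in the store_double_seq_cst checks above.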