[X86] Use plain load/store instead of cmpxchg16b for atomics with AVX #74275
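Summary: all x86-64 CPUs that implement AVX atomically load and store aligned 16-byte values, so the backend can lower atomic i128 loads and stores to a single [V]MOVAPS/[V]MOVDQA instead of a lock cmpxchg16b sequence. As a rough user-level illustration (a hypothetical example, not taken from the patch; std::atomic<unsigned __int128> may need -latomic where the inline sequence is unavailable), this is the kind of code that changes:

#include <atomic>

// Hypothetical example: a naturally aligned 16-byte atomic. Before this
// change, even a relaxed load compiled to a lock cmpxchg16b sequence on
// x86-64; with -mavx it can now be a single vmovdqa.
std::atomic<unsigned __int128> g{0};

unsigned __int128 read_g() { return g.load(std::memory_order_relaxed); }
void write_g(unsigned __int128 v) { g.store(v, std::memory_order_relaxed); }

The first hunk registers the custom lowering in the X86 backend constructor: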
Changes from 3 commits
@@ -508,6 +508,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   if (!Subtarget.is64Bit())
     setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);

+  if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
+    // All CPUs supporting AVX will atomically load/store aligned 128-bit
+    // values, so we can emit [V]MOVAPS/[V]MOVDQA.
+    setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
+    setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
+  }
+
   if (Subtarget.canUseCMPXCHG16B())
     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
@@ -30095,12 +30102,16 @@ TargetLoweringBase::AtomicExpansionKind
 X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   Type *MemType = SI->getValueOperand()->getType();

-  bool NoImplicitFloatOps =
-      SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
-  if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
-      !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
-      (Subtarget.hasSSE1() || Subtarget.hasX87()))
-    return AtomicExpansionKind::None;
+  if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
+      !Subtarget.useSoftFloat()) {
+    if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+        (Subtarget.hasSSE1() || Subtarget.hasX87()))
+      return AtomicExpansionKind::None;
+
+    if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
+        Subtarget.hasAVX())
+      return AtomicExpansionKind::None;
+  }

   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
                                  : AtomicExpansionKind::None;
@@ -30115,12 +30126,16 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
   // can use movq to do the load. If we have X87 we can load into an 80-bit
   // X87 register and store it to a stack temporary.

Reviewer: Can you move the comment down to the case it applies to (inside the outer if)? Currently it binds to both cases, which is misleading.
Author: Done.

-  bool NoImplicitFloatOps =
-      LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
-  if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
-      !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
-      (Subtarget.hasSSE1() || Subtarget.hasX87()))
-    return AtomicExpansionKind::None;
+  if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
+      !Subtarget.useSoftFloat()) {
+    if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+        (Subtarget.hasSSE1() || Subtarget.hasX87()))
+      return AtomicExpansionKind::None;
+
+    if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
+        Subtarget.hasAVX())
+      return AtomicExpansionKind::None;
+  }

   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
                                  : AtomicExpansionKind::None;
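The CmpXChg fallback for loads is what produced the lock cmpxchg16b with all-zero operands in the old test output below: a compare-exchange of 0 against 0 leaves memory unchanged and hands back the current value. An illustrative builtin-level equivalent (needs -mcx16 or libatomic to build):

// Illustrative equivalent of the CmpXChg expansion of an atomic 128-bit load.
unsigned __int128 load_via_cmpxchg(unsigned __int128 *p) {
  unsigned __int128 expected = 0;
  __atomic_compare_exchange_n(p, &expected, (unsigned __int128)0,
                              /*weak=*/false, __ATOMIC_SEQ_CST,
                              __ATOMIC_SEQ_CST);
  return expected;  // cmpxchg wrote the value that was in memory back here
}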
@@ -31259,14 +31274,23 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
   if (!IsSeqCst && IsTypeLegal)
     return Op;

-  if (VT == MVT::i64 && !IsTypeLegal) {
+  if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
+      !DAG.getMachineFunction().getFunction().hasFnAttribute(
+          Attribute::NoImplicitFloat)) {
+    SDValue Chain;
+    // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
+    // vector store.
+    if (VT == MVT::i128) {
+      if (Subtarget.is64Bit() && Subtarget.hasAVX()) {

Reviewer: Looks like you can collapse one level of if clause here.
Author: Done.

+        SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
+        Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
+                             Node->getMemOperand());
+      }
+    }
+
-    bool NoImplicitFloatOps =
-        DAG.getMachineFunction().getFunction().hasFnAttribute(
-            Attribute::NoImplicitFloat);
-    if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
-      SDValue Chain;
+    // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
+    // is enabled.
+    if (VT == MVT::i64) {
       if (Subtarget.hasSSE1()) {

Reviewer: Same here.
Author: This has an "else if" attached, so better not to.

         SDValue SclToVec =
             DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
@@ -31298,15 +31322,15 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
           DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
                                   StoreOps, MVT::i64, Node->getMemOperand());
       }
-      if (Chain) {
-        // If this is a sequentially consistent store, also emit an appropriate
-        // barrier.
-        if (IsSeqCst)
-          Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
-
-        return Chain;
-      }
-    }
+    }
+
+    if (Chain) {
+      // If this is a sequentially consistent store, also emit an appropriate
+      // barrier.
+      if (IsSeqCst)
+        Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
+
+      return Chain;
+    }
   }
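Note the ordering split in this lowering: the bare vector store is enough for anything weaker than seq_cst, and a seq_cst store additionally gets a full barrier from emitLockedStackOp (a locked read-modify-write on the stack, which x86 uses as a cheaper full fence than mfence). A hypothetical user-level illustration of the two flavors:

#include <atomic>

std::atomic<unsigned __int128> shared{0};

// Hypothetical illustration: with AVX the release store below can be a bare
// vmovdqa, while the seq_cst store is a vmovdqa followed by a locked stack
// operation that orders it against later loads.
void publish_release(unsigned __int128 v) {
  shared.store(v, std::memory_order_release);
}
void publish_seq_cst(unsigned __int128 v) {
  shared.store(v, std::memory_order_seq_cst);
}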
@@ -32859,12 +32883,30 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     return;
   }
   case ISD::ATOMIC_LOAD: {
-    assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
+    assert(
+        (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
+        "Unexpected VT!");
     bool NoImplicitFloatOps =
         DAG.getMachineFunction().getFunction().hasFnAttribute(
             Attribute::NoImplicitFloat);
     if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
       auto *Node = cast<AtomicSDNode>(N);
+
+      if (N->getValueType(0) == MVT::i128) {
+        if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
+          SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
+                                   Node->getBasePtr(), Node->getMemOperand());
+          SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+                                     DAG.getIntPtrConstant(0, dl));
+          SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+                                     DAG.getIntPtrConstant(1, dl));
+          Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
+                                        {ResL, ResH}));
+          Results.push_back(Ld.getValue(1));
+          return;
+        }
+        break;
+      }
       if (Subtarget.hasSSE1()) {
         // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
         // Then extract the lower 64-bits.
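Since i128 is not a legal type, ReplaceNodeResults must hand the value back as two i64 halves: element 0 of the v2i64 load is the low half (ResL), element 1 the high half (ResH), and BUILD_PAIR glues them together. A scalar model of that node sequence on a little-endian target:

#include <cstdint>

// Scalar equivalent of the EXTRACT_VECTOR_ELT(0/1) + BUILD_PAIR nodes above:
// combine a low and a high 64-bit half into one 128-bit value.
unsigned __int128 build_pair(uint64_t ResL, uint64_t ResH) {
  return ((unsigned __int128)ResH << 64) | ResL;
}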
@@ -228,87 +228,86 @@ define void @widen_broadcast_unaligned(ptr %p0, i32 %v) {
 }

 define i128 @load_i128(ptr %ptr) {
-; CHECK-O0-LABEL: load_i128:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    pushq %rbx
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O0-NEXT:    .cfi_offset %rbx, -16
-; CHECK-O0-NEXT:    xorl %eax, %eax
-; CHECK-O0-NEXT:    movl %eax, %ebx
-; CHECK-O0-NEXT:    movq %rbx, %rax
-; CHECK-O0-NEXT:    movq %rbx, %rdx
-; CHECK-O0-NEXT:    movq %rbx, %rcx
-; CHECK-O0-NEXT:    lock cmpxchg16b (%rdi)
-; CHECK-O0-NEXT:    popq %rbx
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-O0-NEXT:    retq
-;
-; CHECK-O3-LABEL: load_i128:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    pushq %rbx
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O3-NEXT:    .cfi_offset %rbx, -16
-; CHECK-O3-NEXT:    xorl %eax, %eax
-; CHECK-O3-NEXT:    xorl %edx, %edx
-; CHECK-O3-NEXT:    xorl %ecx, %ecx
-; CHECK-O3-NEXT:    xorl %ebx, %ebx
-; CHECK-O3-NEXT:    lock cmpxchg16b (%rdi)
-; CHECK-O3-NEXT:    popq %rbx
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-O3-NEXT:    retq
+; CHECK-O0-CUR-LABEL: load_i128:

Reviewer: Your tests need updating; these check prefixes no longer exist.
Author: Done.

+; CHECK-O0-CUR:       # %bb.0:
+; CHECK-O0-CUR-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-O0-CUR-NEXT:    vmovq %xmm0, %rax
+; CHECK-O0-CUR-NEXT:    vpextrq $1, %xmm0, %rdx
+; CHECK-O0-CUR-NEXT:    retq
+;
+; CHECK-O3-CUR-LABEL: load_i128:
+; CHECK-O3-CUR:       # %bb.0:
+; CHECK-O3-CUR-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-O3-CUR-NEXT:    vmovq %xmm0, %rax
+; CHECK-O3-CUR-NEXT:    vpextrq $1, %xmm0, %rdx
+; CHECK-O3-CUR-NEXT:    retq
+;
+; CHECK-O0-EX-LABEL: load_i128:
+; CHECK-O0-EX:       # %bb.0:
+; CHECK-O0-EX-NEXT:    pushq %rbx
+; CHECK-O0-EX-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-O0-EX-NEXT:    .cfi_offset %rbx, -16
+; CHECK-O0-EX-NEXT:    xorl %eax, %eax
+; CHECK-O0-EX-NEXT:    movl %eax, %ebx
+; CHECK-O0-EX-NEXT:    movq %rbx, %rax
+; CHECK-O0-EX-NEXT:    movq %rbx, %rdx
+; CHECK-O0-EX-NEXT:    movq %rbx, %rcx
+; CHECK-O0-EX-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-O0-EX-NEXT:    popq %rbx
+; CHECK-O0-EX-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-O0-EX-NEXT:    retq
+;
+; CHECK-O3-EX-LABEL: load_i128:
+; CHECK-O3-EX:       # %bb.0:
+; CHECK-O3-EX-NEXT:    pushq %rbx
+; CHECK-O3-EX-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-O3-EX-NEXT:    .cfi_offset %rbx, -16
+; CHECK-O3-EX-NEXT:    xorl %eax, %eax
+; CHECK-O3-EX-NEXT:    xorl %edx, %edx
+; CHECK-O3-EX-NEXT:    xorl %ecx, %ecx
+; CHECK-O3-EX-NEXT:    xorl %ebx, %ebx
+; CHECK-O3-EX-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-O3-EX-NEXT:    popq %rbx
+; CHECK-O3-EX-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-O3-EX-NEXT:    retq
   %v = load atomic i128, ptr %ptr unordered, align 16
   ret i128 %v
 }
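The -CUR prefixes check the new lowering: one vmovdqa, then vmovq/vpextrq to split the halves into rax and rdx, which is how the SysV ABI returns an i128. The -EX prefixes cover the test's other RUN configuration, which still expands through cmpxchg16b. A hypothetical C++ source that would lower to this IR (unordered has no C++ spelling, so relaxed is the nearest analogue):

// Hypothetical source for load_i128; not part of the test file.
unsigned __int128 load_i128(unsigned __int128 *ptr) {
  unsigned __int128 v;
  __atomic_load(ptr, &v, __ATOMIC_RELAXED);
  return v;  // SysV ABI: low half in %rax, high half in %rdx
}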

 define void @store_i128(ptr %ptr, i128 %v) {
-; CHECK-O0-LABEL: store_i128:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    pushq %rbx
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O0-NEXT:    .cfi_offset %rbx, -16
-; CHECK-O0-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT:    movq (%rdi), %rax
-; CHECK-O0-NEXT:    movq 8(%rdi), %rdx
-; CHECK-O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT:    jmp .LBB16_1
-; CHECK-O0-NEXT:  .LBB16_1: # %atomicrmw.start
-; CHECK-O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; CHECK-O0-NEXT:    lock cmpxchg16b (%rsi)
-; CHECK-O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT:    jne .LBB16_1
-; CHECK-O0-NEXT:    jmp .LBB16_2
-; CHECK-O0-NEXT:  .LBB16_2: # %atomicrmw.end
-; CHECK-O0-NEXT:    popq %rbx
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-O0-NEXT:    retq
-;
-; CHECK-O3-LABEL: store_i128:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    pushq %rbx
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O3-NEXT:    .cfi_offset %rbx, -16
-; CHECK-O3-NEXT:    movq %rdx, %rcx
-; CHECK-O3-NEXT:    movq %rsi, %rbx
-; CHECK-O3-NEXT:    movq (%rdi), %rax
-; CHECK-O3-NEXT:    movq 8(%rdi), %rdx
-; CHECK-O3-NEXT:    .p2align 4, 0x90
-; CHECK-O3-NEXT:  .LBB16_1: # %atomicrmw.start
-; CHECK-O3-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-O3-NEXT:    lock cmpxchg16b (%rdi)
-; CHECK-O3-NEXT:    jne .LBB16_1
-; CHECK-O3-NEXT:  # %bb.2: # %atomicrmw.end
-; CHECK-O3-NEXT:    popq %rbx
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-O3-NEXT:    retq
+; CHECK-O0-CUR-LABEL: store_i128:
+; CHECK-O0-CUR:       # %bb.0:
+; CHECK-O0-CUR-NEXT:    vmovq %rsi, %xmm0
+; CHECK-O0-CUR-NEXT:    vmovq %rdx, %xmm1
+; CHECK-O0-CUR-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-O0-CUR-NEXT:    vmovdqa %xmm0, (%rdi)
+; CHECK-O0-CUR-NEXT:    retq
+;
+; CHECK-O3-CUR-LABEL: store_i128:
+; CHECK-O3-CUR:       # %bb.0:
+; CHECK-O3-CUR-NEXT:    vmovq %rdx, %xmm0
+; CHECK-O3-CUR-NEXT:    vmovq %rsi, %xmm1
+; CHECK-O3-CUR-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-O3-CUR-NEXT:    vmovdqa %xmm0, (%rdi)
+; CHECK-O3-CUR-NEXT:    retq
+;
+; CHECK-O0-EX-LABEL: store_i128:
+; CHECK-O0-EX:       # %bb.0:
+; CHECK-O0-EX-NEXT:    pushq %rax
+; CHECK-O0-EX-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-O0-EX-NEXT:    callq __sync_lock_test_and_set_16@PLT
+; CHECK-O0-EX-NEXT:    popq %rax
+; CHECK-O0-EX-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-O0-EX-NEXT:    retq
+;
+; CHECK-O3-EX-LABEL: store_i128:
+; CHECK-O3-EX:       # %bb.0:
+; CHECK-O3-EX-NEXT:    pushq %rax
+; CHECK-O3-EX-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-O3-EX-NEXT:    callq __sync_lock_test_and_set_16@PLT
+; CHECK-O3-EX-NEXT:    popq %rax
+; CHECK-O3-EX-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-O3-EX-NEXT:    retq
   store atomic i128 %v, ptr %ptr unordered, align 16
   ret void
 }
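The store side mirrors this: the two halves arrive in rsi and rdx, are packed with vmovq plus vpunpcklqdq, and written with a single vmovdqa, while the -EX configuration instead calls the __sync_lock_test_and_set_16 libcall. A hypothetical matching source:

// Hypothetical source for store_i128; not part of the test file.
void store_i128(unsigned __int128 *ptr, unsigned __int128 v) {
  __atomic_store(ptr, &v, __ATOMIC_RELAXED);
}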
Reviewer: Do we need to check the alignment here?
Author: No, misaligned atomic ops are converted to an __atomic_* libcall before this function is ever called.
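To make that premise checkable, here is a minimal sketch (an assumption about the x86-64 ABI, not part of the patch) of why the lowering can rely on alignment: a lock-free 16-byte std::atomic is aligned to its size, and anything less aligned is routed to libatomic well before this code runs.

#include <atomic>

// ABI assumption being relied on: a 16-byte std::atomic is 16-byte aligned,
// so the vmovaps/vmovdqa lowering only ever sees aligned addresses.
// Under-aligned atomics are lowered to __atomic_* libcalls much earlier.
static_assert(alignof(std::atomic<unsigned __int128>) == 16,
              "16-byte atomics are expected to be 16-byte aligned");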