[X86] Use plain load/store instead of cmpxchg16b for atomics with AVX #74275

Merged · 8 commits · May 16, 2024
94 changes: 68 additions & 26 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -508,6 +508,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!Subtarget.is64Bit())
setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);

if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
// All CPUs supporting AVX will atomically load/store aligned 128-bit
// values, so we can emit [V]MOVAPS/[V]MOVDQA.
setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
}

if (Subtarget.canUseCMPXCHG16B())
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
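For orientation, a minimal IR sketch of the pattern this hunk enables (hedged: the triple and -mattr=+avx flag are my assumptions, and the exact instruction chosen, vmovaps vs. vmovdqa, depends on the selected pattern):

```llvm
; Hypothetical reproducer, e.g. llc -mtriple=x86_64 -mattr=+avx
define i128 @load_i128_avx(ptr %p) {
  ; Expected to lower to a single aligned 16-byte vector load
  ; ([V]MOVAPS/[V]MOVDQA) instead of a cmpxchg16b loop.
  %v = load atomic i128, ptr %p unordered, align 16
  ret i128 %v
}

define void @store_i128_avx(ptr %p, i128 %v) {
  ; Expected to lower to a single aligned 16-byte vector store.
  store atomic i128 %v, ptr %p unordered, align 16
  ret void
}
```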

@@ -30095,12 +30102,16 @@ TargetLoweringBase::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
Type *MemType = SI->getValueOperand()->getType();

bool NoImplicitFloatOps =
SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
(Subtarget.hasSSE1() || Subtarget.hasX87()))
return AtomicExpansionKind::None;
if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
!Subtarget.useSoftFloat()) {
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
(Subtarget.hasSSE1() || Subtarget.hasX87()))
Collaborator: Do we need to check the alignment here?

Member Author: No, misaligned atomic ops are converted to an __atomic_* libcall before this function is ever called.
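To illustrate the reply with a sketch (not part of the patch; the exact libcall chosen is an assumption): an under-aligned atomic never reaches this hook, because AtomicExpandPass rewrites it into a libcall first.

```llvm
; Hypothetical: a 128-bit atomic with insufficient alignment (8 < 16).
define i128 @load_i128_underaligned(ptr %p) {
  %v = load atomic i128, ptr %p seq_cst, align 8
  ret i128 %v
}
; Expected to become a call to an __atomic_* libcall (libatomic) before
; shouldExpandAtomicLoadInIR/shouldExpandAtomicStoreInIR are ever consulted.
```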

return AtomicExpansionKind::None;

if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
Subtarget.hasAVX())
return AtomicExpansionKind::None;
}

return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
: AtomicExpansionKind::None;
@@ -30115,12 +30126,16 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
// If this is a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
// can use movq to do the load. If we have X87 we can load into an 80-bit
// X87 register and store it to a stack temporary.
Collaborator: Can you move the comment down to the case it applies to (inside the outer if)? Currently it binds to both cases, which is misleading.

Member Author: Done.

bool NoImplicitFloatOps =
LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
(Subtarget.hasSSE1() || Subtarget.hasX87()))
return AtomicExpansionKind::None;
if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
!Subtarget.useSoftFloat()) {
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
(Subtarget.hasSSE1() || Subtarget.hasX87()))
return AtomicExpansionKind::None;

if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
Subtarget.hasAVX())
return AtomicExpansionKind::None;
}

return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;
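For contrast, a hedged sketch of the path that remains when AVX is unavailable (the flag spelling is an assumption): needsCmpXchgNb() still applies to i128, so the atomic load is expanded as before.

```llvm
; Hypothetical, e.g. llc -mtriple=x86_64 -mattr=+cx16,-avx
define i128 @load_i128_no_avx(ptr %p) {
  ; Falls through to needsCmpXchgNb(), i.e. AtomicExpansionKind::CmpXChg,
  ; and is still lowered to a lock cmpxchg16b loop.
  %v = load atomic i128, ptr %p monotonic, align 16
  ret i128 %v
}
```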
@@ -31259,14 +31274,23 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
if (!IsSeqCst && IsTypeLegal)
return Op;

if (VT == MVT::i64 && !IsTypeLegal) {
if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
!DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat)) {
SDValue Chain;
// For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
// vector store.
if (VT == MVT::i128) {
if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
Collaborator: Looks like you can collapse one level of if clause here.

Member Author: Done.

SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
Node->getMemOperand());
}
}

// For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
// is enabled.
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
SDValue Chain;
if (VT == MVT::i64) {
if (Subtarget.hasSSE1()) {
Collaborator: Same here.

Member Author: This has an "else if" attached, so better not to.

SDValue SclToVec =
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
@@ -31298,15 +31322,15 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
StoreOps, MVT::i64, Node->getMemOperand());
}
}

if (Chain) {
// If this is a sequentially consistent store, also emit an appropriate
// barrier.
if (IsSeqCst)
Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
if (Chain) {
// If this is a sequentially consistent store, also emit an appropriate
// barrier.
if (IsSeqCst)
Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);

return Chain;
}
return Chain;
}
}
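A sketch of the seq_cst store case this hunk handles (hedged: the exact barrier instruction comes from emitLockedStackOp and is an assumption here): the store itself becomes one 16-byte vector store, followed by a locked stack operation as the barrier, rather than a cmpxchg16b loop.

```llvm
; Hypothetical, assuming a 64-bit AVX target.
define void @store_i128_seq_cst(ptr %p, i128 %v) {
  ; Expected: a vmovaps/vmovdqa store, then a `lock or`-style operation on
  ; the stack (emitLockedStackOp) because IsSeqCst is true.
  store atomic i128 %v, ptr %p seq_cst, align 16
  ret void
}
```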

@@ -32859,12 +32883,30 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::ATOMIC_LOAD: {
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
assert(
(N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
"Unexpected VT!");
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
auto *Node = cast<AtomicSDNode>(N);

if (N->getValueType(0) == MVT::i128) {
if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
Node->getBasePtr(), Node->getMemOperand());
SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
DAG.getIntPtrConstant(0, dl));
SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
DAG.getIntPtrConstant(1, dl));
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
{ResL, ResH}));
Results.push_back(Ld.getValue(1));
return;
}
break;
}
if (Subtarget.hasSSE1()) {
// Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
// Then extract the lower 64-bits.
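A minimal reproducer (hypothetical) for the new ATOMIC_LOAD case in ReplaceNodeResults: the illegal i128 result is now replaced during type legalization with a v2i64 load, two EXTRACT_VECTOR_ELTs, and a BUILD_PAIR, instead of a cmpxchg16b loop.

```llvm
; Hypothetical: only the low half is used, but legalization still goes
; through the ReplaceNodeResults path for the illegal i128 atomic load.
define i64 @load_i128_low_half(ptr %p) {
  %v = load atomic i128, ptr %p monotonic, align 16
  %lo = trunc i128 %v to i64
  ret i64 %lo
}
```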
31 changes: 3 additions & 28 deletions llvm/test/CodeGen/X86/atomic-non-integer-fp128.ll
@@ -28,22 +28,7 @@ define void @store_fp128(ptr %fptr, fp128 %v) {
;
; X64-AVX-LABEL: store_fp128:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: pushq %rbx
; X64-AVX-NEXT: .cfi_def_cfa_offset 16
; X64-AVX-NEXT: .cfi_offset %rbx, -16
; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rbx
; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; X64-AVX-NEXT: movq (%rdi), %rax
; X64-AVX-NEXT: movq 8(%rdi), %rdx
; X64-AVX-NEXT: .p2align 4, 0x90
; X64-AVX-NEXT: .LBB0_1: # %atomicrmw.start
; X64-AVX-NEXT: # =>This Inner Loop Header: Depth=1
; X64-AVX-NEXT: lock cmpxchg16b (%rdi)
; X64-AVX-NEXT: jne .LBB0_1
; X64-AVX-NEXT: # %bb.2: # %atomicrmw.end
; X64-AVX-NEXT: popq %rbx
; X64-AVX-NEXT: .cfi_def_cfa_offset 8
; X64-AVX-NEXT: vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT: retq
store atomic fp128 %v, ptr %fptr unordered, align 16
ret void
@@ -69,19 +54,9 @@ define fp128 @load_fp128(ptr %fptr) {
;
; X64-AVX-LABEL: load_fp128:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: pushq %rbx
; X64-AVX-NEXT: .cfi_def_cfa_offset 16
; X64-AVX-NEXT: .cfi_offset %rbx, -16
; X64-AVX-NEXT: xorl %eax, %eax
; X64-AVX-NEXT: xorl %edx, %edx
; X64-AVX-NEXT: xorl %ecx, %ecx
; X64-AVX-NEXT: xorl %ebx, %ebx
; X64-AVX-NEXT: lock cmpxchg16b (%rdi)
; X64-AVX-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
; X64-AVX-NEXT: popq %rbx
; X64-AVX-NEXT: .cfi_def_cfa_offset 8
; X64-AVX-NEXT: retq
%v = load atomic fp128, ptr %fptr unordered, align 16
ret fp128 %v
1 change: 0 additions & 1 deletion llvm/test/CodeGen/X86/atomic-non-integer.ll
@@ -131,7 +131,6 @@ define void @store_double(ptr %fptr, double %v) {
ret void
}


define half @load_half(ptr %fptr) {
; X86-SSE1-LABEL: load_half:
; X86-SSE1: # %bb.0:
151 changes: 75 additions & 76 deletions llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -228,87 +228,86 @@ define void @widen_broadcast_unaligned(ptr %p0, i32 %v) {
}

define i128 @load_i128(ptr %ptr) {
; CHECK-O0-LABEL: load_i128:
; CHECK-O0: # %bb.0:
; CHECK-O0-NEXT: pushq %rbx
; CHECK-O0-NEXT: .cfi_def_cfa_offset 16
; CHECK-O0-NEXT: .cfi_offset %rbx, -16
; CHECK-O0-NEXT: xorl %eax, %eax
; CHECK-O0-NEXT: movl %eax, %ebx
; CHECK-O0-NEXT: movq %rbx, %rax
; CHECK-O0-NEXT: movq %rbx, %rdx
; CHECK-O0-NEXT: movq %rbx, %rcx
; CHECK-O0-NEXT: lock cmpxchg16b (%rdi)
; CHECK-O0-NEXT: popq %rbx
; CHECK-O0-NEXT: .cfi_def_cfa_offset 8
; CHECK-O0-NEXT: retq
;
; CHECK-O3-LABEL: load_i128:
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: pushq %rbx
; CHECK-O3-NEXT: .cfi_def_cfa_offset 16
; CHECK-O3-NEXT: .cfi_offset %rbx, -16
; CHECK-O3-NEXT: xorl %eax, %eax
; CHECK-O3-NEXT: xorl %edx, %edx
; CHECK-O3-NEXT: xorl %ecx, %ecx
; CHECK-O3-NEXT: xorl %ebx, %ebx
; CHECK-O3-NEXT: lock cmpxchg16b (%rdi)
; CHECK-O3-NEXT: popq %rbx
; CHECK-O3-NEXT: .cfi_def_cfa_offset 8
; CHECK-O3-NEXT: retq
; CHECK-O0-CUR-LABEL: load_i128:
Collaborator: Your tests need to be updated; these check prefixes no longer exist.

Member Author: Done.

; CHECK-O0-CUR: # %bb.0:
; CHECK-O0-CUR-NEXT: vmovdqa (%rdi), %xmm0
; CHECK-O0-CUR-NEXT: vmovq %xmm0, %rax
; CHECK-O0-CUR-NEXT: vpextrq $1, %xmm0, %rdx
; CHECK-O0-CUR-NEXT: retq
;
; CHECK-O3-CUR-LABEL: load_i128:
; CHECK-O3-CUR: # %bb.0:
; CHECK-O3-CUR-NEXT: vmovdqa (%rdi), %xmm0
; CHECK-O3-CUR-NEXT: vmovq %xmm0, %rax
; CHECK-O3-CUR-NEXT: vpextrq $1, %xmm0, %rdx
; CHECK-O3-CUR-NEXT: retq
;
; CHECK-O0-EX-LABEL: load_i128:
; CHECK-O0-EX: # %bb.0:
; CHECK-O0-EX-NEXT: pushq %rbx
; CHECK-O0-EX-NEXT: .cfi_def_cfa_offset 16
; CHECK-O0-EX-NEXT: .cfi_offset %rbx, -16
; CHECK-O0-EX-NEXT: xorl %eax, %eax
; CHECK-O0-EX-NEXT: movl %eax, %ebx
; CHECK-O0-EX-NEXT: movq %rbx, %rax
; CHECK-O0-EX-NEXT: movq %rbx, %rdx
; CHECK-O0-EX-NEXT: movq %rbx, %rcx
; CHECK-O0-EX-NEXT: lock cmpxchg16b (%rdi)
; CHECK-O0-EX-NEXT: popq %rbx
; CHECK-O0-EX-NEXT: .cfi_def_cfa_offset 8
; CHECK-O0-EX-NEXT: retq
;
; CHECK-O3-EX-LABEL: load_i128:
; CHECK-O3-EX: # %bb.0:
; CHECK-O3-EX-NEXT: pushq %rbx
; CHECK-O3-EX-NEXT: .cfi_def_cfa_offset 16
; CHECK-O3-EX-NEXT: .cfi_offset %rbx, -16
; CHECK-O3-EX-NEXT: xorl %eax, %eax
; CHECK-O3-EX-NEXT: xorl %edx, %edx
; CHECK-O3-EX-NEXT: xorl %ecx, %ecx
; CHECK-O3-EX-NEXT: xorl %ebx, %ebx
; CHECK-O3-EX-NEXT: lock cmpxchg16b (%rdi)
; CHECK-O3-EX-NEXT: popq %rbx
; CHECK-O3-EX-NEXT: .cfi_def_cfa_offset 8
; CHECK-O3-EX-NEXT: retq
%v = load atomic i128, ptr %ptr unordered, align 16
ret i128 %v
}

define void @store_i128(ptr %ptr, i128 %v) {
; CHECK-O0-LABEL: store_i128:
; CHECK-O0: # %bb.0:
; CHECK-O0-NEXT: pushq %rbx
; CHECK-O0-NEXT: .cfi_def_cfa_offset 16
; CHECK-O0-NEXT: .cfi_offset %rbx, -16
; CHECK-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-O0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-O0-NEXT: movq (%rdi), %rax
; CHECK-O0-NEXT: movq 8(%rdi), %rdx
; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-O0-NEXT: jmp .LBB16_1
; CHECK-O0-NEXT: .LBB16_1: # %atomicrmw.start
; CHECK-O0-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; CHECK-O0-NEXT: lock cmpxchg16b (%rsi)
; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-O0-NEXT: jne .LBB16_1
; CHECK-O0-NEXT: jmp .LBB16_2
; CHECK-O0-NEXT: .LBB16_2: # %atomicrmw.end
; CHECK-O0-NEXT: popq %rbx
; CHECK-O0-NEXT: .cfi_def_cfa_offset 8
; CHECK-O0-NEXT: retq
;
; CHECK-O3-LABEL: store_i128:
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: pushq %rbx
; CHECK-O3-NEXT: .cfi_def_cfa_offset 16
; CHECK-O3-NEXT: .cfi_offset %rbx, -16
; CHECK-O3-NEXT: movq %rdx, %rcx
; CHECK-O3-NEXT: movq %rsi, %rbx
; CHECK-O3-NEXT: movq (%rdi), %rax
; CHECK-O3-NEXT: movq 8(%rdi), %rdx
; CHECK-O3-NEXT: .p2align 4, 0x90
; CHECK-O3-NEXT: .LBB16_1: # %atomicrmw.start
; CHECK-O3-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-O3-NEXT: lock cmpxchg16b (%rdi)
; CHECK-O3-NEXT: jne .LBB16_1
; CHECK-O3-NEXT: # %bb.2: # %atomicrmw.end
; CHECK-O3-NEXT: popq %rbx
; CHECK-O3-NEXT: .cfi_def_cfa_offset 8
; CHECK-O3-NEXT: retq
; CHECK-O0-CUR-LABEL: store_i128:
; CHECK-O0-CUR: # %bb.0:
; CHECK-O0-CUR-NEXT: vmovq %rsi, %xmm0
; CHECK-O0-CUR-NEXT: vmovq %rdx, %xmm1
; CHECK-O0-CUR-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-O0-CUR-NEXT: vmovdqa %xmm0, (%rdi)
; CHECK-O0-CUR-NEXT: retq
;
; CHECK-O3-CUR-LABEL: store_i128:
; CHECK-O3-CUR: # %bb.0:
; CHECK-O3-CUR-NEXT: vmovq %rdx, %xmm0
; CHECK-O3-CUR-NEXT: vmovq %rsi, %xmm1
; CHECK-O3-CUR-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; CHECK-O3-CUR-NEXT: vmovdqa %xmm0, (%rdi)
; CHECK-O3-CUR-NEXT: retq
;
; CHECK-O0-EX-LABEL: store_i128:
; CHECK-O0-EX: # %bb.0:
; CHECK-O0-EX-NEXT: pushq %rax
; CHECK-O0-EX-NEXT: .cfi_def_cfa_offset 16
; CHECK-O0-EX-NEXT: callq __sync_lock_test_and_set_16@PLT
; CHECK-O0-EX-NEXT: popq %rax
; CHECK-O0-EX-NEXT: .cfi_def_cfa_offset 8
; CHECK-O0-EX-NEXT: retq
;
; CHECK-O3-EX-LABEL: store_i128:
; CHECK-O3-EX: # %bb.0:
; CHECK-O3-EX-NEXT: pushq %rax
; CHECK-O3-EX-NEXT: .cfi_def_cfa_offset 16
; CHECK-O3-EX-NEXT: callq __sync_lock_test_and_set_16@PLT
; CHECK-O3-EX-NEXT: popq %rax
; CHECK-O3-EX-NEXT: .cfi_def_cfa_offset 8
; CHECK-O3-EX-NEXT: retq
store atomic i128 %v, ptr %ptr unordered, align 16
ret void
}