Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[X86] Use plain load/store instead of cmpxchg16b for atomics with AVX #74275

Merged
merged 8 commits into from
May 16, 2024
100 changes: 70 additions & 30 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!Subtarget.is64Bit())
setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);

if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
// All CPUs supporting AVX will atomically load/store aligned 128-bit
// values, so we can emit [V]MOVAPS/[V]MOVDQA.
setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
}

if (Subtarget.canUseCMPXCHG16B())
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);

Expand Down Expand Up @@ -30415,32 +30422,40 @@ TargetLoweringBase::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
Type *MemType = SI->getValueOperand()->getType();

bool NoImplicitFloatOps =
SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
(Subtarget.hasSSE1() || Subtarget.hasX87()))
return AtomicExpansionKind::None;
if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
!Subtarget.useSoftFloat()) {
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
(Subtarget.hasSSE1() || Subtarget.hasX87()))
return AtomicExpansionKind::None;

if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
Subtarget.hasAVX())
return AtomicExpansionKind::None;
}

return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
: AtomicExpansionKind::None;
}

// Note: this turns large loads into lock cmpxchg8b/16b.
// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
Type *MemType = LI->getType();

// If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
// can use movq to do the load. If we have X87 we can load into an 80-bit
// X87 register and store it to a stack temporary.
bool NoImplicitFloatOps =
LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
(Subtarget.hasSSE1() || Subtarget.hasX87()))
return AtomicExpansionKind::None;
if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
!Subtarget.useSoftFloat()) {
// If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
// can use movq to do the load. If we have X87 we can load into an 80-bit
// X87 register and store it to a stack temporary.
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
(Subtarget.hasSSE1() || Subtarget.hasX87()))
return AtomicExpansionKind::None;

// If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
Subtarget.hasAVX())
return AtomicExpansionKind::None;
}

return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;
Expand Down Expand Up @@ -31683,14 +31698,21 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
if (!IsSeqCst && IsTypeLegal)
return Op;

if (VT == MVT::i64 && !IsTypeLegal) {
if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
!DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat)) {
SDValue Chain;
// For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
// vector store.
if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
Node->getMemOperand());
}

// For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
// is enabled.
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
SDValue Chain;
if (VT == MVT::i64) {
if (Subtarget.hasSSE1()) {
SDValue SclToVec =
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
Expand Down Expand Up @@ -31722,15 +31744,15 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
StoreOps, MVT::i64, Node->getMemOperand());
}
}

if (Chain) {
// If this is a sequentially consistent store, also emit an appropriate
// barrier.
if (IsSeqCst)
Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
if (Chain) {
// If this is a sequentially consistent store, also emit an appropriate
// barrier.
if (IsSeqCst)
Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);

return Chain;
}
return Chain;
}
}

Expand Down Expand Up @@ -33303,12 +33325,30 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::ATOMIC_LOAD: {
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
assert(
(N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
"Unexpected VT!");
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
auto *Node = cast<AtomicSDNode>(N);

if (N->getValueType(0) == MVT::i128) {
if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
Node->getBasePtr(), Node->getMemOperand());
SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
DAG.getIntPtrConstant(0, dl));
SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
DAG.getIntPtrConstant(1, dl));
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
{ResL, ResH}));
Results.push_back(Ld.getValue(1));
return;
}
break;
}
if (Subtarget.hasSSE1()) {
// Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
// Then extract the lower 64-bits.
Expand Down
31 changes: 3 additions & 28 deletions llvm/test/CodeGen/X86/atomic-non-integer-fp128.ll
Original file line number Diff line number Diff line change
Expand Up @@ -28,22 +28,7 @@ define void @store_fp128(ptr %fptr, fp128 %v) {
;
; X64-AVX-LABEL: store_fp128:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: pushq %rbx
; X64-AVX-NEXT: .cfi_def_cfa_offset 16
; X64-AVX-NEXT: .cfi_offset %rbx, -16
; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rbx
; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; X64-AVX-NEXT: movq (%rdi), %rax
; X64-AVX-NEXT: movq 8(%rdi), %rdx
; X64-AVX-NEXT: .p2align 4, 0x90
; X64-AVX-NEXT: .LBB0_1: # %atomicrmw.start
; X64-AVX-NEXT: # =>This Inner Loop Header: Depth=1
; X64-AVX-NEXT: lock cmpxchg16b (%rdi)
; X64-AVX-NEXT: jne .LBB0_1
; X64-AVX-NEXT: # %bb.2: # %atomicrmw.end
; X64-AVX-NEXT: popq %rbx
; X64-AVX-NEXT: .cfi_def_cfa_offset 8
; X64-AVX-NEXT: vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT: retq
store atomic fp128 %v, ptr %fptr unordered, align 16
ret void
Expand All @@ -69,19 +54,9 @@ define fp128 @load_fp128(ptr %fptr) {
;
; X64-AVX-LABEL: load_fp128:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: pushq %rbx
; X64-AVX-NEXT: .cfi_def_cfa_offset 16
; X64-AVX-NEXT: .cfi_offset %rbx, -16
; X64-AVX-NEXT: xorl %eax, %eax
; X64-AVX-NEXT: xorl %edx, %edx
; X64-AVX-NEXT: xorl %ecx, %ecx
; X64-AVX-NEXT: xorl %ebx, %ebx
; X64-AVX-NEXT: lock cmpxchg16b (%rdi)
; X64-AVX-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
; X64-AVX-NEXT: popq %rbx
; X64-AVX-NEXT: .cfi_def_cfa_offset 8
; X64-AVX-NEXT: retq
%v = load atomic fp128, ptr %fptr unordered, align 16
ret fp128 %v
Expand Down
1 change: 0 additions & 1 deletion llvm/test/CodeGen/X86/atomic-non-integer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,6 @@ define void @store_double(ptr %fptr, double %v) {
ret void
}


define half @load_half(ptr %fptr) {
; X86-SSE1-LABEL: load_half:
; X86-SSE1: # %bb.0:
Expand Down
33 changes: 33 additions & 0 deletions llvm/test/CodeGen/X86/atomic-unaligned.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
; RUN: llc -mtriple=x86_64 < %s | FileCheck %s

; Quick test to ensure that atomics which are not naturally-aligned
; emit unsized libcalls, and aren't emitted as native instructions or
; sized libcalls.
; i32 atomics require 4-byte natural alignment; `align 2` is under-aligned, so
; every operation below must lower to an unsized `__atomic_*` libcall — never a
; native instruction or a sized (`__atomic_*_4`) call.
define void @test_i32(ptr %a) nounwind {
; CHECK-LABEL: test_i32:
; CHECK: callq __atomic_load
; CHECK: callq __atomic_store
; CHECK: callq __atomic_exchange
; CHECK: callq __atomic_compare_exchange
; CHECK: callq __atomic_compare_exchange
%t0 = load atomic i32, ptr %a seq_cst, align 2
store atomic i32 1, ptr %a seq_cst, align 2
%t1 = atomicrmw xchg ptr %a, i32 1 seq_cst, align 2
; NOTE(review): two __atomic_compare_exchange CHECKs above — presumably one for
; the `atomicrmw add` CAS-loop lowering and one for the explicit cmpxchg below;
; confirm against the generated assembly.
%t3 = atomicrmw add ptr %a, i32 2 seq_cst, align 2
%t2 = cmpxchg ptr %a, i32 0, i32 1 seq_cst seq_cst, align 2
ret void
}

; i128 atomics require 16-byte natural alignment; `align 8` is under-aligned,
; so these must also emit unsized `__atomic_*` libcalls — in particular they
; must NOT use the new AVX 128-bit plain load/store path, which is only legal
; for aligned accesses.
define void @test_i128(ptr %a) nounwind {
; CHECK-LABEL: test_i128:
; CHECK: callq __atomic_load
; CHECK: callq __atomic_store
; CHECK: callq __atomic_exchange
; CHECK: callq __atomic_compare_exchange
%t0 = load atomic i128, ptr %a seq_cst, align 8
store atomic i128 1, ptr %a seq_cst, align 8
%t1 = atomicrmw xchg ptr %a, i128 1 seq_cst, align 8
; NOTE(review): no CHECK pins the lowering of the `atomicrmw add` below;
; unanchored CHECK lines permit extra calls in between, so it is presumably
; covered by another __atomic_compare_exchange call — verify.
%t2 = atomicrmw add ptr %a, i128 2 seq_cst, align 8
%t3 = cmpxchg ptr %a, i128 0, i128 1 seq_cst seq_cst, align 8
ret void
}
83 changes: 14 additions & 69 deletions llvm/test/CodeGen/X86/atomic-unordered.ll
Original file line number Diff line number Diff line change
Expand Up @@ -228,86 +228,31 @@ define void @widen_broadcast_unaligned(ptr %p0, i32 %v) {
}

define i128 @load_i128(ptr %ptr) {
; CHECK-O0-LABEL: load_i128:
; CHECK-O0: # %bb.0:
; CHECK-O0-NEXT: pushq %rbx
; CHECK-O0-NEXT: .cfi_def_cfa_offset 16
; CHECK-O0-NEXT: .cfi_offset %rbx, -16
; CHECK-O0-NEXT: xorl %eax, %eax
; CHECK-O0-NEXT: movl %eax, %ebx
; CHECK-O0-NEXT: movq %rbx, %rax
; CHECK-O0-NEXT: movq %rbx, %rdx
; CHECK-O0-NEXT: movq %rbx, %rcx
; CHECK-O0-NEXT: lock cmpxchg16b (%rdi)
; CHECK-O0-NEXT: popq %rbx
; CHECK-O0-NEXT: .cfi_def_cfa_offset 8
; CHECK-O0-NEXT: retq
;
; CHECK-O3-LABEL: load_i128:
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: pushq %rbx
; CHECK-O3-NEXT: .cfi_def_cfa_offset 16
; CHECK-O3-NEXT: .cfi_offset %rbx, -16
; CHECK-O3-NEXT: xorl %eax, %eax
; CHECK-O3-NEXT: xorl %edx, %edx
; CHECK-O3-NEXT: xorl %ecx, %ecx
; CHECK-O3-NEXT: xorl %ebx, %ebx
; CHECK-O3-NEXT: lock cmpxchg16b (%rdi)
; CHECK-O3-NEXT: popq %rbx
; CHECK-O3-NEXT: .cfi_def_cfa_offset 8
; CHECK-O3-NEXT: retq
; CHECK-LABEL: load_i128:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm0
; CHECK-NEXT: vmovq %xmm0, %rax
; CHECK-NEXT: vpextrq $1, %xmm0, %rdx
; CHECK-NEXT: retq
%v = load atomic i128, ptr %ptr unordered, align 16
ret i128 %v
}

define void @store_i128(ptr %ptr, i128 %v) {
; CHECK-O0-LABEL: store_i128:
; CHECK-O0: # %bb.0:
; CHECK-O0-NEXT: pushq %rbx
; CHECK-O0-NEXT: .cfi_def_cfa_offset 16
; CHECK-O0-NEXT: .cfi_offset %rbx, -16
; CHECK-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-O0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-O0-NEXT: movq (%rdi), %rax
; CHECK-O0-NEXT: movq 8(%rdi), %rdx
; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-O0-NEXT: jmp .LBB16_1
; CHECK-O0-NEXT: .LBB16_1: # %atomicrmw.start
; CHECK-O0-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; CHECK-O0-NEXT: lock cmpxchg16b (%rsi)
; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-O0-NEXT: jne .LBB16_1
; CHECK-O0-NEXT: jmp .LBB16_2
; CHECK-O0-NEXT: .LBB16_2: # %atomicrmw.end
; CHECK-O0-NEXT: popq %rbx
; CHECK-O0-NEXT: .cfi_def_cfa_offset 8
; CHECK-O0-NEXT: vmovq %rsi, %xmm0
; CHECK-O0-NEXT: vmovq %rdx, %xmm1
; CHECK-O0-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-O0-NEXT: vmovdqa %xmm0, (%rdi)
; CHECK-O0-NEXT: retq
;
; CHECK-O3-LABEL: store_i128:
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: pushq %rbx
; CHECK-O3-NEXT: .cfi_def_cfa_offset 16
; CHECK-O3-NEXT: .cfi_offset %rbx, -16
; CHECK-O3-NEXT: movq %rdx, %rcx
; CHECK-O3-NEXT: movq %rsi, %rbx
; CHECK-O3-NEXT: movq (%rdi), %rax
; CHECK-O3-NEXT: movq 8(%rdi), %rdx
; CHECK-O3-NEXT: .p2align 4, 0x90
; CHECK-O3-NEXT: .LBB16_1: # %atomicrmw.start
; CHECK-O3-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-O3-NEXT: lock cmpxchg16b (%rdi)
; CHECK-O3-NEXT: jne .LBB16_1
; CHECK-O3-NEXT: # %bb.2: # %atomicrmw.end
; CHECK-O3-NEXT: popq %rbx
; CHECK-O3-NEXT: .cfi_def_cfa_offset 8
; CHECK-O3-NEXT: vmovq %rdx, %xmm0
; CHECK-O3-NEXT: vmovq %rsi, %xmm1
; CHECK-O3-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; CHECK-O3-NEXT: vmovdqa %xmm0, (%rdi)
; CHECK-O3-NEXT: retq
store atomic i128 %v, ptr %ptr unordered, align 16
ret void
Expand Down
Loading
Loading