[WoA] Remove extra barriers after ARM LSE instructions with MSVC #169596
Conversation
llvm@c9821ab added extra fences after sequentially consistent stores for compatibility with MSVC's seq_cst loads (ldr+dmb). These extra fences should not be needed for ARM LSE instructions that have both acquire and release semantics: that combination acts as a two-way barrier and is enough for sequential consistency. Fixes llvm#162345. Change-Id: I9148c73d0dcf3bf1b18a0915f96cac71ac1800f2
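For reference, a minimal before/after sketch of the codegen this changes for a seq_cst atomicrmw add on aarch64-windows-msvc with +lse (symbol and register names are taken from the updated test below; exact register allocation may differ):

Before this patch:
    adrp    x8, var32
    add     x8, x8, :lo12:var32
    ldaddal w0, w0, [x8]
    dmb     ish
    ret

After this patch:
    adrp    x8, var32
    add     x8, x8, :lo12:var32
    ldaddal w0, w0, [x8]
    ret

When the operation is outlined instead (no +lse), the __aarch64_ldadd*_acq_rel call only provides acq_rel semantics, so the trailing dmb ish is kept, as the MSVC-OUTLINE-ATOMICS checks show.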
@llvm/pr-subscribers-backend-aarch64
Author: Usman Nadeem (UsmanNadeem)
Changes
c9821ab added extra fences after sequentially consistent stores for compatibility with MSVC's seq_cst loads (ldr+dmb). These extra fences should not be needed for ARM LSE instructions that have both acquire and release semantics: that combination acts as a two-way barrier and is enough for sequential consistency.
Fixes #162345
Change-Id: I9148c73d0dcf3bf1b18a0915f96cac71ac1800f2
Patch is 811.51 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169596.diff
7 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 7df5d8a09f0f6..d901fa2f20055 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2245,8 +2245,7 @@ class LLVM_ABI TargetLoweringBase {
/// Whether AtomicExpandPass should automatically insert a trailing fence
/// without reducing the ordering for this atomic. Defaults to false.
- virtual bool
- shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const {
+ virtual bool storeNeedsSeqCstTrailingFence(Instruction *I) const {
return false;
}
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index d9bc042d6807e..aec008c570e2a 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -345,21 +345,13 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
if (FenceOrdering != AtomicOrdering::Monotonic) {
MadeChange |= bracketInstWithFences(I, FenceOrdering);
}
- } else if (I->hasAtomicStore() &&
- TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
- auto FenceOrdering = AtomicOrdering::Monotonic;
- if (SI)
- FenceOrdering = SI->getOrdering();
- else if (RMWI)
- FenceOrdering = RMWI->getOrdering();
- else if (CASI && TLI->shouldExpandAtomicCmpXchgInIR(CASI) !=
- TargetLoweringBase::AtomicExpansionKind::LLSC)
- // LLSC is handled in expandAtomicCmpXchg().
- FenceOrdering = CASI->getSuccessOrdering();
-
+ } else if (TLI->storeNeedsSeqCstTrailingFence(I) &&
+ !(CASI && TLI->shouldExpandAtomicCmpXchgInIR(CASI) ==
+ TargetLoweringBase::AtomicExpansionKind::LLSC)) {
+ // CmpXchg LLSC is handled in expandAtomicCmpXchg().
IRBuilder Builder(I);
- if (auto TrailingFence =
- TLI->emitTrailingFence(Builder, I, FenceOrdering)) {
+ if (auto TrailingFence = TLI->emitTrailingFence(
+ Builder, I, AtomicOrdering::SequentiallyConsistent)) {
TrailingFence->moveAfter(I);
MadeChange = true;
}
@@ -1511,8 +1503,7 @@ bool AtomicExpandImpl::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
// Make sure later instructions don't get reordered with a fence if
// necessary.
Builder.SetInsertPoint(SuccessBB);
- if (ShouldInsertFencesForAtomic ||
- TLI->shouldInsertTrailingFenceForAtomicStore(CI))
+ if (ShouldInsertFencesForAtomic || TLI->storeNeedsSeqCstTrailingFence(CI))
TLI->emitTrailingFence(Builder, CI, SuccessOrder);
Builder.CreateBr(ExitBB);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 83ce39fa314d1..a99413fe03431 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29446,8 +29446,8 @@ bool AArch64TargetLowering::shouldInsertFencesForAtomic(
return false;
}
-bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
- const Instruction *I) const {
+bool AArch64TargetLowering::storeNeedsSeqCstTrailingFence(
+ Instruction *I) const {
// Store-Release instructions only provide seq_cst guarantees when paired with
// Load-Acquire instructions. MSVC CRT does not use these instructions to
// implement seq_cst loads and stores, so we need additional explicit fences
@@ -29455,19 +29455,31 @@ bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
return false;
- switch (I->getOpcode()) {
- default:
+ if (auto *SI = dyn_cast<StoreInst>(I))
+ return SI->getOrdering() == AtomicOrdering::SequentiallyConsistent;
+
+ auto *CAS = dyn_cast<AtomicCmpXchgInst>(I);
+ auto *RMW = dyn_cast<AtomicRMWInst>(I);
+ // Not a store.
+ if (!CAS && !RMW)
return false;
- case Instruction::AtomicCmpXchg:
- return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
- AtomicOrdering::SequentiallyConsistent;
- case Instruction::AtomicRMW:
- return cast<AtomicRMWInst>(I)->getOrdering() ==
- AtomicOrdering::SequentiallyConsistent;
- case Instruction::Store:
- return cast<StoreInst>(I)->getOrdering() ==
- AtomicOrdering::SequentiallyConsistent;
- }
+
+ // Fence only needed for seq_cst.
+ if (CAS &&
+ CAS->getSuccessOrdering() != AtomicOrdering::SequentiallyConsistent)
+ return false;
+ if (RMW && RMW->getOrdering() != AtomicOrdering::SequentiallyConsistent)
+ return false;
+
+ // We do not need a fence only if we have LSE and are not expanding.
+ TargetLoweringBase::AtomicExpansionKind ExpandKind =
+ CAS ? shouldExpandAtomicCmpXchgInIR(CAS) : shouldExpandAtomicRMWInIR(RMW);
+ if (ExpandKind == AtomicExpansionKind::None && Subtarget->hasLSE())
+ return false;
+ if (RMW && ExpandKind == AtomicExpansionKind::CmpXChg && Subtarget->hasLSE())
+ return false;
+
+ return true;
}
// Loads and stores less than 128-bits are already atomic; ones above that
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index ca08eb40c956a..8a99fcad212c2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -349,8 +349,7 @@ class AArch64TargetLowering : public TargetLowering {
bool isOpSuitableForLSE128(const Instruction *I) const;
bool isOpSuitableForRCPC3(const Instruction *I) const;
bool shouldInsertFencesForAtomic(const Instruction *I) const override;
- bool
- shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const override;
+ bool storeNeedsSeqCstTrailingFence(Instruction *I) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
diff --git a/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll b/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll
index 70f3b5cc488ea..e784042ebb1d3 100644
--- a/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll
+++ b/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll
@@ -1,13 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse -aarch64-enable-sink-fold=true < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse -mattr=+outline-atomics -aarch64-enable-sink-fold=true < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+outline-atomics -aarch64-enable-sink-fold=true < %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS
; RUN: llc -mtriple=aarch64_be-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse -aarch64-enable-sink-fold=true < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse -aarch64-enable-sink-fold=true < %s | FileCheck %s --check-prefix=CHECK-REG
+; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse -aarch64-enable-sink-fold=true < %s | FileCheck %s --check-prefix="CHECK-REG" --allow-unused-prefixes --implicit-check-not="stlxrb {{w|x}}[[NEW:[0-9]+]], {{w|x}}[[NEW:[0-9]+]]], [x{{[0-9]+}}]"
-; Point of CHECK-REG is to make sure UNPREDICTABLE instructions aren't created
+; Point of implicit-check-not is to make sure UNPREDICTABLE instructions aren't created
; (i.e. reusing a register for status & data in store exclusive).
-; CHECK-REG-NOT: stlxrb w[[NEW:[0-9]+]], w[[NEW]], [x{{[0-9]+}}]
-; CHECK-REG-NOT: stlxrb w[[NEW:[0-9]+]], x[[NEW]], [x{{[0-9]+}}]
+; CHECK-REG: {{.*}}
+
+; RUN: llc -mtriple=aarch64-windows-pc-msvc -disable-post-ra -verify-machineinstrs \
+; RUN: -mattr=+lse -aarch64-enable-sink-fold=true < %s | FileCheck %s --implicit-check-not="dmb"
+; RUN: llc -mtriple=aarch64-windows-pc-msvc -disable-post-ra -verify-machineinstrs \
+; RUN: -mattr=+lse -mattr=+outline-atomics -aarch64-enable-sink-fold=true < %s | FileCheck %s --implicit-check-not="dmb"
+; RUN: llc -mtriple=aarch64-windows-pc-msvc -disable-post-ra -verify-machineinstrs \
+; RUN: -mattr=+outline-atomics -aarch64-enable-sink-fold=true < %s | FileCheck %s --check-prefixes=MSVC-OUTLINE-ATOMICS
+
@var8 = dso_local global i8 0
@var16 = dso_local global i16 0
@@ -17,6 +25,12 @@
define dso_local i8 @test_atomic_load_add_i8(i8 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_add_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, var8
+; CHECK-NEXT: add x8, x8, :lo12:var8
+; CHECK-NEXT: ldaddalb w0, w0, [x8]
+; CHECK-NEXT: ret
+;
; OUTLINE-ATOMICS-LABEL: test_atomic_load_add_i8:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -25,19 +39,30 @@ define dso_local i8 @test_atomic_load_add_i8(i8 %offset) nounwind {
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldadd1_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
+;
+; MSVC-OUTLINE-ATOMICS-LABEL: test_atomic_load_add_i8:
+; MSVC-OUTLINE-ATOMICS: // %bb.0:
+; MSVC-OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MSVC-OUTLINE-ATOMICS-NEXT: adrp x1, var8
+; MSVC-OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var8
+; MSVC-OUTLINE-ATOMICS-NEXT: bl __aarch64_ldadd1_acq_rel
+; MSVC-OUTLINE-ATOMICS-NEXT: dmb ish
+; MSVC-OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MSVC-OUTLINE-ATOMICS-NEXT: ret
%old = atomicrmw add ptr @var8, i8 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
-; CHECK: ldaddalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
-; CHECK-NOT: dmb
ret i8 %old
}
define dso_local i16 @test_atomic_load_add_i16(i16 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_add_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, var16
+; CHECK-NEXT: add x8, x8, :lo12:var16
+; CHECK-NEXT: ldaddalh w0, w0, [x8]
+; CHECK-NEXT: ret
+;
; OUTLINE-ATOMICS-LABEL: test_atomic_load_add_i16:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -46,19 +71,30 @@ define dso_local i16 @test_atomic_load_add_i16(i16 %offset) nounwind {
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldadd2_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
+;
+; MSVC-OUTLINE-ATOMICS-LABEL: test_atomic_load_add_i16:
+; MSVC-OUTLINE-ATOMICS: // %bb.0:
+; MSVC-OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MSVC-OUTLINE-ATOMICS-NEXT: adrp x1, var16
+; MSVC-OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var16
+; MSVC-OUTLINE-ATOMICS-NEXT: bl __aarch64_ldadd2_acq_rel
+; MSVC-OUTLINE-ATOMICS-NEXT: dmb ish
+; MSVC-OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MSVC-OUTLINE-ATOMICS-NEXT: ret
%old = atomicrmw add ptr @var16, i16 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
-; CHECK: ldaddalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
-; CHECK-NOT: dmb
ret i16 %old
}
define dso_local i32 @test_atomic_load_add_i32(i32 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_add_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, var32
+; CHECK-NEXT: add x8, x8, :lo12:var32
+; CHECK-NEXT: ldaddal w0, w0, [x8]
+; CHECK-NEXT: ret
+;
; OUTLINE-ATOMICS-LABEL: test_atomic_load_add_i32:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -67,19 +103,30 @@ define dso_local i32 @test_atomic_load_add_i32(i32 %offset) nounwind {
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldadd4_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
+;
+; MSVC-OUTLINE-ATOMICS-LABEL: test_atomic_load_add_i32:
+; MSVC-OUTLINE-ATOMICS: // %bb.0:
+; MSVC-OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MSVC-OUTLINE-ATOMICS-NEXT: adrp x1, var32
+; MSVC-OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var32
+; MSVC-OUTLINE-ATOMICS-NEXT: bl __aarch64_ldadd4_acq_rel
+; MSVC-OUTLINE-ATOMICS-NEXT: dmb ish
+; MSVC-OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MSVC-OUTLINE-ATOMICS-NEXT: ret
%old = atomicrmw add ptr @var32, i32 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
-; CHECK: ldaddal w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
-; CHECK-NOT: dmb
ret i32 %old
}
define dso_local i64 @test_atomic_load_add_i64(i64 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_add_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, var64
+; CHECK-NEXT: add x8, x8, :lo12:var64
+; CHECK-NEXT: ldaddal x0, x0, [x8]
+; CHECK-NEXT: ret
+;
; OUTLINE-ATOMICS-LABEL: test_atomic_load_add_i64:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -88,19 +135,30 @@ define dso_local i64 @test_atomic_load_add_i64(i64 %offset) nounwind {
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldadd8_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
+;
+; MSVC-OUTLINE-ATOMICS-LABEL: test_atomic_load_add_i64:
+; MSVC-OUTLINE-ATOMICS: // %bb.0:
+; MSVC-OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MSVC-OUTLINE-ATOMICS-NEXT: adrp x1, var64
+; MSVC-OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var64
+; MSVC-OUTLINE-ATOMICS-NEXT: bl __aarch64_ldadd8_acq_rel
+; MSVC-OUTLINE-ATOMICS-NEXT: dmb ish
+; MSVC-OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MSVC-OUTLINE-ATOMICS-NEXT: ret
%old = atomicrmw add ptr @var64, i64 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
-; CHECK: ldaddal x[[OLD:[0-9]+]], x[[NEW:[0-9]+]], [x[[ADDR]]]
-; CHECK-NOT: dmb
ret i64 %old
}
define dso_local void @test_atomic_load_add_i32_noret(i32 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_add_i32_noret:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, var32
+; CHECK-NEXT: add x8, x8, :lo12:var32
+; CHECK-NEXT: ldaddal w0, w8, [x8]
+; CHECK-NEXT: ret
+;
; OUTLINE-ATOMICS-LABEL: test_atomic_load_add_i32_noret:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -109,18 +167,29 @@ define dso_local void @test_atomic_load_add_i32_noret(i32 %offset) nounwind {
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldadd4_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
+;
+; MSVC-OUTLINE-ATOMICS-LABEL: test_atomic_load_add_i32_noret:
+; MSVC-OUTLINE-ATOMICS: // %bb.0:
+; MSVC-OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MSVC-OUTLINE-ATOMICS-NEXT: adrp x1, var32
+; MSVC-OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var32
+; MSVC-OUTLINE-ATOMICS-NEXT: bl __aarch64_ldadd4_acq_rel
+; MSVC-OUTLINE-ATOMICS-NEXT: dmb ish
+; MSVC-OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MSVC-OUTLINE-ATOMICS-NEXT: ret
atomicrmw add ptr @var32, i32 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
-; CHECK: ldaddal w0, w[[NEW:[0-9]+]], [x[[ADDR]]]
-; CHECK-NOT: dmb
ret void
}
define dso_local void @test_atomic_load_add_i64_noret(i64 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_add_i64_noret:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, var64
+; CHECK-NEXT: add x8, x8, :lo12:var64
+; CHECK-NEXT: ldaddal x0, x8, [x8]
+; CHECK-NEXT: ret
+;
; OUTLINE-ATOMICS-LABEL: test_atomic_load_add_i64_noret:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -129,18 +198,29 @@ define dso_local void @test_atomic_load_add_i64_noret(i64 %offset) nounwind {
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldadd8_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
+;
+; MSVC-OUTLINE-ATOMICS-LABEL: test_atomic_load_add_i64_noret:
+; MSVC-OUTLINE-ATOMICS: // %bb.0:
+; MSVC-OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MSVC-OUTLINE-ATOMICS-NEXT: adrp x1, var64
+; MSVC-OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var64
+; MSVC-OUTLINE-ATOMICS-NEXT: bl __aarch64_ldadd8_acq_rel
+; MSVC-OUTLINE-ATOMICS-NEXT: dmb ish
+; MSVC-OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MSVC-OUTLINE-ATOMICS-NEXT: ret
atomicrmw add ptr @var64, i64 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
-; CHECK: ldaddal x0, x[[NEW:[0-9]+]], [x[[ADDR]]]
-; CHECK-NOT: dmb
ret void
}
define dso_local i8 @test_atomic_load_or_i8(i8 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_or_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, var8
+; CHECK-NEXT: add x8, x8, :lo12:var8
+; CHECK-NEXT: ldsetalb w0, w0, [x8]
+; CHECK-NEXT: ret
+;
; OUTLINE-ATOMICS-LABEL: test_atomic_load_or_i8:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -149,19 +229,30 @@ define dso_local i8 @test_atomic_load_or_i8(i8 %offset) nounwind {
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldset1_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
+;
+; MSVC-OUTLINE-ATOMICS-LABEL: test_atomic_load_or_i8:
+; MSVC-OUTLINE-ATOMICS: // %bb.0:
+; MSVC-OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MSVC-OUTLINE-ATOMICS-NEXT: adrp x1, var8
+; MSVC-OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var8
+; MSVC-OUTLINE-ATOMICS-NEXT: bl __aarch64_ldset1_acq_rel
+; MSVC-OUTLINE-ATOMICS-NEXT: dmb ish
+; MSVC-OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MSVC-OUTLINE-ATOMICS-NEXT: ret
%old = atomicrmw or ptr @var8, i8 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
-; CHECK: ldsetalb w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
-; CHECK-NOT: dmb
ret i8 %old
}
define dso_local i16 @test_atomic_load_or_i16(i16 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_or_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, var16
+; CHECK-NEXT: add x8, x8, :lo12:var16
+; CHECK-NEXT: ldsetalh w0, w0, [x8]
+; CHECK-NEXT: ret
+;
; OUTLINE-ATOMICS-LABEL: test_atomic_load_or_i16:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -170,19 +261,30 @@ define dso_local i16 @test_atomic_load_or_i16(i16 %offset) nounwind {
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldset2_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
+;
+; MSVC-OUTLINE-ATOMICS-LABEL: test_atomic_load_or_i16:
+; MSVC-OUTLINE-ATOMICS: // %bb.0:
+; MSVC-OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MSVC-OUTLINE-ATOMICS-NEXT: adrp x1, var16
+; MSVC-OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var16
+; MSVC-OUTLINE-ATOMICS-NEXT: bl __aarch64_ldset2_acq_rel
+; MSVC-OUTLINE-ATOMICS-NEXT: dmb ish
+; MSVC-OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MSVC-OUTLINE-ATOMICS-NEXT: ret
%old = atomicrmw or ptr @var16, i16 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
-; CHECK: ldsetalh w[[OLD:[0-9]+]], w[[NEW:[0-9]+]], [x[[ADDR]]]
-; CHECK-NOT: dmb
ret i16 %old
}
define dso_local i32 @test_atomic_load_or_i32(i32 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_or_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, var32
+; CHECK-NEXT: add x8, x8, :lo12:var32
+; CHECK-NEXT: ldsetal w0, w0, [x8]
+; CHECK-NEXT: ret...
[truncated]
Change-Id: Iea7753ccc9fe18c8fb70ab0dc4a1117cd619f49f
Change-Id: Iede7601d616f29dff3e04182d94e9a5f233ccf66
Trying some stuff using https://developer.arm.com/herd7, it looks like not having a DMB after LDADDAL is correct. Trying the following example (based on my comment in https://reviews.llvm.org/D141748) gives the answer "No", meaning it is not possible for sequential consistency to be violated. By comparison, using LDADDL (i.e. no acquire) gives an answer of "Ok", meaning in that case it is possible for sequential consistency to be violated (and a DMB after it fixes that).
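The exact litmus test from that comment is not reproduced here; a hypothetical store-buffering shape along those lines, pairing a seq_cst LSE RMW with an MSVC-style seq_cst load (ldr followed by dmb ish), might look like this (names and register choices are illustrative):

AArch64 SB+ldaddal+msvc-load
(* Hypothetical sketch: each thread does a seq_cst RMW (ldaddal) to one
   location, then an MSVC-style seq_cst load (ldr; dmb ish) of the other.
   The exists clause asks whether both loads can read the initial value 0,
   an outcome that sequential consistency forbids. *)
{
0:X0=1; 0:X1=x; 0:X3=y;
1:X0=1; 1:X1=y; 1:X3=x;
}
 P0                  | P1                  ;
 LDADDAL W0,W2,[X1]  | LDADDAL W0,W2,[X1]  ;
 LDR     W4,[X3]     | LDR     W4,[X3]     ;
 DMB     ISH         | DMB     ISH         ;
exists (0:X4=0 /\ 1:X4=0)

Swapping LDADDAL for LDADDL (release-only) in a shape like this is what makes the relaxed outcome observable again.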
Update comment as well. Change-Id: If5929f66da99cf9750c9c5f73aed295c2a5cf734