Skip to content

Commit

Permalink
[Remarks] Emit optimization remarks for atomics generating CAS loop
Browse files Browse the repository at this point in the history
Implements ORE in AtomicExpand pass to report atomics generating
a compare and swap loop.

Differential Revision: https://reviews.llvm.org/D106891
  • Loading branch information
gandhi56 committed Aug 15, 2021
1 parent 530aa7e commit 4357852
Show file tree
Hide file tree
Showing 11 changed files with 249 additions and 7 deletions.
16 changes: 16 additions & 0 deletions clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu
@@ -0,0 +1,16 @@
// RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -fcuda-is-device \
// RUN: -target-cpu gfx90a -Rpass=atomic-expand -S -o - 2>&1 | \
// RUN: FileCheck %s --check-prefix=GFX90A-CAS

// REQUIRES: amdgpu-registered-target

#include "Inputs/cuda.h"
#include <stdatomic.h>

// GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope
// GFX90A-CAS-LABEL: _Z14atomic_add_casPf
// GFX90A-CAS: flat_atomic_cmpswap v0, v[2:3], v[4:5] glc
// GFX90A-CAS: s_cbranch_execnz
__device__ float atomic_add_cas(float *p) {
return __atomic_fetch_add(p, 1.0f, memory_order_relaxed);
}
46 changes: 46 additions & 0 deletions clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl
@@ -0,0 +1,46 @@
// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -triple=amdgcn-amd-amdhsa -target-cpu gfx90a \
// RUN: -Rpass=atomic-expand -S -o - 2>&1 | \
// RUN: FileCheck %s --check-prefix=REMARK

// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -triple=amdgcn-amd-amdhsa -target-cpu gfx90a \
// RUN: -Rpass=atomic-expand -S -emit-llvm -o - 2>&1 | \
// RUN: FileCheck %s --check-prefix=GFX90A-CAS

// REQUIRES: amdgpu-registered-target

typedef enum memory_order {
memory_order_relaxed = __ATOMIC_RELAXED,
memory_order_acquire = __ATOMIC_ACQUIRE,
memory_order_release = __ATOMIC_RELEASE,
memory_order_acq_rel = __ATOMIC_ACQ_REL,
memory_order_seq_cst = __ATOMIC_SEQ_CST
} memory_order;

typedef enum memory_scope {
memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
#endif
} memory_scope;

// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at workgroup-one-as memory scope [-Rpass=atomic-expand]
// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at agent-one-as memory scope [-Rpass=atomic-expand]
// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at one-as memory scope [-Rpass=atomic-expand]
// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at wavefront-one-as memory scope [-Rpass=atomic-expand]
// GFX90A-CAS-LABEL: @atomic_cas
// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("workgroup-one-as") monotonic
// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("agent-one-as") monotonic
// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("one-as") monotonic
// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("wavefront-one-as") monotonic
float atomic_cas(__global atomic_float *d, float a) {
float ret1 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_work_group);
float ret2 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_device);
float ret3 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_all_svm_devices);
float ret4 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_sub_group);
}



23 changes: 22 additions & 1 deletion llvm/lib/CodeGen/AtomicExpandPass.cpp
Expand Up @@ -17,6 +17,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/CodeGen/AtomicExpandUtils.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/TargetLowering.h"
Expand Down Expand Up @@ -58,6 +59,7 @@ namespace {

class AtomicExpand: public FunctionPass {
const TargetLowering *TLI = nullptr;
OptimizationRemarkEmitter *ORE;

public:
static char ID; // Pass identification, replacement for typeid
Expand All @@ -69,6 +71,7 @@ namespace {
bool runOnFunction(Function &F) override;

private:
void getAnalysisUsage(AnalysisUsage &AU) const override;
bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
Expand Down Expand Up @@ -165,11 +168,16 @@ static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
}

void AtomicExpand::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
}

bool AtomicExpand::runOnFunction(Function &F) {
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
if (!TPC)
return false;

ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
auto &TM = TPC->getTM<TargetMachine>();
if (!TM.getSubtargetImpl(F)->enableAtomicExpand())
return false;
Expand Down Expand Up @@ -570,7 +578,9 @@ static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
}

bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
LLVMContext &Ctx = AI->getModule()->getContext();
TargetLowering::AtomicExpansionKind Kind = TLI->shouldExpandAtomicRMWInIR(AI);
switch (Kind) {
case TargetLoweringBase::AtomicExpansionKind::None:
return false;
case TargetLoweringBase::AtomicExpansionKind::LLSC: {
Expand Down Expand Up @@ -600,6 +610,17 @@ bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
expandPartwordAtomicRMW(AI,
TargetLoweringBase::AtomicExpansionKind::CmpXChg);
} else {
SmallVector<StringRef> SSNs;
Ctx.getSyncScopeNames(SSNs);
auto MemScope = SSNs[AI->getSyncScopeID()].empty()
? "system"
: SSNs[AI->getSyncScopeID()];
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "Passed", AI->getFunction())
<< "A compare and swap loop was generated for an atomic "
<< AI->getOperationName(AI->getOperation()) << " operation at "
<< MemScope << " memory scope";
});
expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
}
return true;
Expand Down
7 changes: 6 additions & 1 deletion llvm/test/CodeGen/AArch64/O0-pipeline.ll
Expand Up @@ -8,13 +8,18 @@
; CHECK-NEXT: Target Pass Configuration
; CHECK-NEXT: Machine Module Information
; CHECK-NEXT: Target Transform Information
; CHECK-NEXT: Profile summary info
; CHECK-NEXT: Create Garbage Collector Module Metadata
; CHECK-NEXT: Assumption Cache Tracker
; CHECK-NEXT: Profile summary info
; CHECK-NEXT: Machine Branch Probability Analysis
; CHECK-NEXT: ModulePass Manager
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Lazy Branch Probability Analysis
; CHECK-NEXT: Lazy Block Frequency Analysis
; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Lower Garbage Collection Instructions
Expand Down
7 changes: 6 additions & 1 deletion llvm/test/CodeGen/AArch64/O3-pipeline.ll
Expand Up @@ -8,15 +8,20 @@
; CHECK-NEXT: Target Pass Configuration
; CHECK-NEXT: Machine Module Information
; CHECK-NEXT: Target Transform Information
; CHECK-NEXT: Assumption Cache Tracker
; CHECK-NEXT: Profile summary info
; CHECK-NEXT: Assumption Cache Tracker
; CHECK-NEXT: Type-Based Alias Analysis
; CHECK-NEXT: Scoped NoAlias Alias Analysis
; CHECK-NEXT: Create Garbage Collector Module Metadata
; CHECK-NEXT: Machine Branch Probability Analysis
; CHECK-NEXT: ModulePass Manager
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Lazy Branch Probability Analysis
; CHECK-NEXT: Lazy Block Frequency Analysis
; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: SVE intrinsics optimizations
; CHECK-NEXT: FunctionPass Manager
Expand Down
103 changes: 103 additions & 0 deletions llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll
@@ -0,0 +1,103 @@
; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs --pass-remarks=atomic-expand \
; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=GFX90A-CAS

; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope
; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at agent memory scope
; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at workgroup memory scope
; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at wavefront memory scope
; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at singlethread memory scope
; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at one-as memory scope
; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at agent-one-as memory scope
; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at workgroup-one-as memory scope
; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at wavefront-one-as memory scope
; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at singlethread-one-as memory scope

; GFX90A-CAS-LABEL: atomic_add_cas:
; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-CAS: s_cbranch_execnz
define dso_local void @atomic_add_cas(float* %p, float %q) {
entry:
%ret = atomicrmw fadd float* %p, float %q monotonic, align 4
ret void
}

; GFX90A-CAS-LABEL: atomic_add_cas_agent:
; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-CAS: s_cbranch_execnz
define dso_local void @atomic_add_cas_agent(float* %p, float %q) {
entry:
%ret = atomicrmw fadd float* %p, float %q syncscope("agent") monotonic, align 4
ret void
}

; GFX90A-CAS-LABEL: atomic_add_cas_workgroup:
; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-CAS: s_cbranch_execnz
define dso_local void @atomic_add_cas_workgroup(float* %p, float %q) {
entry:
%ret = atomicrmw fadd float* %p, float %q syncscope("workgroup") monotonic, align 4
ret void
}

; GFX90A-CAS-LABEL: atomic_add_cas_wavefront:
; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-CAS: s_cbranch_execnz
define dso_local void @atomic_add_cas_wavefront(float* %p, float %q) {
entry:
%ret = atomicrmw fadd float* %p, float %q syncscope("wavefront") monotonic, align 4
ret void
}

; GFX90A-CAS-LABEL: atomic_add_cas_singlethread:
; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-CAS: s_cbranch_execnz
define dso_local void @atomic_add_cas_singlethread(float* %p, float %q) {
entry:
%ret = atomicrmw fadd float* %p, float %q syncscope("singlethread") monotonic, align 4
ret void
}

; GFX90A-CAS-LABEL: atomic_add_cas_one_as:
; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-CAS: s_cbranch_execnz
define dso_local void @atomic_add_cas_one_as(float* %p, float %q) {
entry:
%ret = atomicrmw fadd float* %p, float %q syncscope("one-as") monotonic, align 4
ret void
}

; GFX90A-CAS-LABEL: atomic_add_cas_agent_one_as:
; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-CAS: s_cbranch_execnz
define dso_local void @atomic_add_cas_agent_one_as(float* %p, float %q) {
entry:
%ret = atomicrmw fadd float* %p, float %q syncscope("agent-one-as") monotonic, align 4
ret void
}

; GFX90A-CAS-LABEL: atomic_add_cas_workgroup_one_as:
; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-CAS: s_cbranch_execnz
define dso_local void @atomic_add_cas_workgroup_one_as(float* %p, float %q) {
entry:
%ret = atomicrmw fadd float* %p, float %q syncscope("workgroup-one-as") monotonic, align 4
ret void
}

; GFX90A-CAS-LABEL: atomic_add_cas_wavefront_one_as:
; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-CAS: s_cbranch_execnz
define dso_local void @atomic_add_cas_wavefront_one_as(float* %p, float %q) {
entry:
%ret = atomicrmw fadd float* %p, float %q syncscope("wavefront-one-as") monotonic, align 4
ret void
}

; GFX90A-CAS-LABEL: atomic_add_cas_singlethread_one_as:
; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-CAS: s_cbranch_execnz
define dso_local void @atomic_add_cas_singlethread_one_as(float* %p, float %q) {
entry:
%ret = atomicrmw fadd float* %p, float %q syncscope("singlethread-one-as") monotonic, align 4
ret void
}
25 changes: 25 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
Expand Up @@ -44,6 +44,11 @@
; GCN-O0-NEXT: Lower OpenCL enqueued blocks
; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions
; GCN-O0-NEXT: FunctionPass Manager
; GCN-O0-NEXT: Dominator Tree Construction
; GCN-O0-NEXT: Natural Loop Information
; GCN-O0-NEXT: Lazy Branch Probability Analysis
; GCN-O0-NEXT: Lazy Block Frequency Analysis
; GCN-O0-NEXT: Optimization Remark Emitter
; GCN-O0-NEXT: Expand Atomic instructions
; GCN-O0-NEXT: Lower constant intrinsics
; GCN-O0-NEXT: Remove unreachable blocks from the CFG
Expand Down Expand Up @@ -180,6 +185,11 @@
; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: Infer address spaces
; GCN-O1-NEXT: Dominator Tree Construction
; GCN-O1-NEXT: Natural Loop Information
; GCN-O1-NEXT: Lazy Branch Probability Analysis
; GCN-O1-NEXT: Lazy Block Frequency Analysis
; GCN-O1-NEXT: Optimization Remark Emitter
; GCN-O1-NEXT: Expand Atomic instructions
; GCN-O1-NEXT: AMDGPU Promote Alloca
; GCN-O1-NEXT: Dominator Tree Construction
Expand Down Expand Up @@ -431,6 +441,11 @@
; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions
; GCN-O1-OPTS-NEXT: FunctionPass Manager
; GCN-O1-OPTS-NEXT: Infer address spaces
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Natural Loop Information
; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis
; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis
; GCN-O1-OPTS-NEXT: Optimization Remark Emitter
; GCN-O1-OPTS-NEXT: Expand Atomic instructions
; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
Expand Down Expand Up @@ -715,6 +730,11 @@
; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: Infer address spaces
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Natural Loop Information
; GCN-O2-NEXT: Lazy Branch Probability Analysis
; GCN-O2-NEXT: Lazy Block Frequency Analysis
; GCN-O2-NEXT: Optimization Remark Emitter
; GCN-O2-NEXT: Expand Atomic instructions
; GCN-O2-NEXT: AMDGPU Promote Alloca
; GCN-O2-NEXT: Dominator Tree Construction
Expand Down Expand Up @@ -1001,6 +1021,11 @@
; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: Infer address spaces
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Natural Loop Information
; GCN-O3-NEXT: Lazy Branch Probability Analysis
; GCN-O3-NEXT: Lazy Block Frequency Analysis
; GCN-O3-NEXT: Optimization Remark Emitter
; GCN-O3-NEXT: Expand Atomic instructions
; GCN-O3-NEXT: AMDGPU Promote Alloca
; GCN-O3-NEXT: Dominator Tree Construction
Expand Down
5 changes: 5 additions & 0 deletions llvm/test/CodeGen/ARM/O3-pipeline.ll
Expand Up @@ -5,6 +5,11 @@
; CHECK: ModulePass Manager
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Lazy Branch Probability Analysis
; CHECK-NEXT: Lazy Block Frequency Analysis
; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Simplify the CFG
; CHECK-NEXT: Dominator Tree Construction
Expand Down
10 changes: 8 additions & 2 deletions llvm/test/CodeGen/PowerPC/O3-pipeline.ll
Expand Up @@ -8,16 +8,21 @@
; CHECK-NEXT: Target Pass Configuration
; CHECK-NEXT: Machine Module Information
; CHECK-NEXT: Target Transform Information
; CHECK-NEXT: Profile summary info
; CHECK-NEXT: Assumption Cache Tracker
; CHECK-NEXT: Type-Based Alias Analysis
; CHECK-NEXT: Scoped NoAlias Alias Analysis
; CHECK-NEXT: Profile summary info
; CHECK-NEXT: Create Garbage Collector Module Metadata
; CHECK-NEXT: Machine Branch Probability Analysis
; CHECK-NEXT: ModulePass Manager
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Convert i1 constants to i32/i64 if they are returned
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Lazy Branch Probability Analysis
; CHECK-NEXT: Lazy Block Frequency Analysis
; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: PPC Lower MASS Entries
; CHECK-NEXT: FunctionPass Manager
Expand Down Expand Up @@ -206,4 +211,5 @@

define void @f() {
ret void
}
}

0 comments on commit 4357852

Please sign in to comment.