diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp index e17c2113ca398..f7dff4ba4c5e7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp @@ -273,6 +273,8 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { // this is a re-run of the pass // so we don't have anything to do. // - No variables are absolute. + // Named-barriers which are absolute symbols are removed + // from the maps. std::optional HasAbsoluteGVs; bool HasSpecialGVs = false; for (auto &Map : {DirectMapKernel, IndirectMapKernel}) { @@ -284,6 +286,10 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { if (IsDirectMapDynLDSGV) continue; if (isNamedBarrier(*GV)) { + if (IsAbsolute) { + DirectMapKernel[Fn].erase(GV); + IndirectMapKernel[Fn].erase(GV); + } HasSpecialGVs = true; continue; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 75748012a228e..2cfcc62ed5982 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -465,6 +465,11 @@ static cl::opt EnableScalarIRPasses( cl::init(true), cl::Hidden); +static cl::opt + EnableLowerExecSync("amdgpu-enable-lower-exec-sync", + cl::desc("Enable lowering of exec sync pass."), + cl::init(true), cl::Hidden); + static cl::opt EnableSwLowerLDS("amdgpu-enable-sw-lower-lds", cl::desc("Enable lowering of lds to global memory pass " @@ -963,6 +968,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // We want to support the -lto-partitions=N option as "best effort". // For that, we need to lower LDS earlier in the pipeline before the // module is partitioned for codegen. + if (EnableLowerExecSync) + PM.addPass(AMDGPULowerExecSyncPass()); if (EnableSwLowerLDS) PM.addPass(AMDGPUSwLowerLDSPass(*this)); if (EnableLowerModuleLDS) @@ -1331,6 +1338,10 @@ void AMDGPUPassConfig::addIRPasses() { // Make enqueued block runtime handles externally visible. addPass(createAMDGPUExportKernelRuntimeHandlesLegacyPass()); + // Lower special LDS accesses. + if (EnableLowerExecSync) + addPass(createAMDGPULowerExecSyncLegacyPass()); + // Lower LDS accesses to global memory pass if address sanitizer is enabled. if (EnableSwLowerLDS) addPass(createAMDGPUSwLowerLDSLegacyPass(&TM)); @@ -2076,6 +2087,9 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { addPass(AMDGPUExportKernelRuntimeHandlesPass()); + if (EnableLowerExecSync) + addPass(AMDGPULowerExecSyncPass()); + if (EnableSwLowerLDS) addPass(AMDGPUSwLowerLDSPass(TM)); diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll new file mode 100644 index 0000000000000..bed8fa20a5044 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll @@ -0,0 +1,122 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-exec-sync,amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s +; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-lower-module-lds -mtriple=amdgcn-amd-amdhsa | FileCheck %s + +; Test to ensure that LDS variables like named barriers are lowered correctly, +; where amdgpu-lower-module-lds pass runs in pipeline after amdgpu-lower-exec-sync pass. + +%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } +@bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison +@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison +@bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison +@lds1 = internal addrspace(3) global [1 x i8] poison, align 4 + +;. +; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]] +; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]] +; CHECK: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]] +; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol [[META3:![0-9]+]] +; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" +;. +define void @func1() #0 { +; CHECK-LABEL: define void @func1( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + ret void +} + +define void @func2() #0 { +; CHECK-LABEL: define void @func2( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: store i8 7, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4 +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + store i8 7, ptr addrspace(3) @lds1, align 4 + ret void +} + +define amdgpu_kernel void @kernel1() #0 { +; CHECK-LABEL: define amdgpu_kernel void @kernel1( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1.kernel1) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1.kernel1) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: call void @func1() +; CHECK-NEXT: call void @func2() +; CHECK-NEXT: store i8 9, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4 +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier() + call void @func1() + call void @func2() + store i8 9, ptr addrspace(3) @lds1, align 4 + ret void +} + +define amdgpu_kernel void @kernel2() #0 { +; CHECK-LABEL: define amdgpu_kernel void @kernel2( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: call void @func2() +; CHECK-NEXT: store i8 10, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4 +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + call void @func2() + store i8 10, ptr addrspace(3) @lds1, align 4 + ret void +} + +declare void @llvm.amdgcn.s.barrier() #1 +declare void @llvm.amdgcn.s.barrier.wait(i16) #1 +declare void @llvm.amdgcn.s.barrier.signal(i32) #1 +declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1 +declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1 +declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1 +declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1 +declare void @llvm.amdgcn.s.barrier.leave(i16) #1 +declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1 +declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1 + +attributes #0 = { nounwind } +attributes #1 = { convergent nounwind } +attributes #2 = { nounwind readnone } + +;. +; CHECK: attributes #[[ATTR0]] = { nounwind } +; CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-lds-size"="1" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nounwind } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +;. +; CHECK: [[META0]] = !{i32 8396816, i32 8396817} +; CHECK: [[META1]] = !{i32 8396912, i32 8396913} +; CHECK: [[META2]] = !{i32 8396848, i32 8396849} +; CHECK: [[META3]] = !{i32 0, i32 1} +;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll new file mode 100644 index 0000000000000..05f2f07c84503 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll @@ -0,0 +1,73 @@ +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-exec-sync,amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false < %s 2>&1 | FileCheck %s +; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s + +; Test to ensure that LDS variables like named barriers are lowered correctly in asan scenario, +; where amdgpu-sw-lower-lds pass runs in pipeline after amdgpu-lower-exec-sync pass. +%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } +@bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison +@bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison +@lds1 = internal addrspace(3) global [1 x i8] poison, align 4 + +;. +; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META1:![0-9]+]] +; +define void @bar() #0 { +; CHECK-LABEL: define void @bar( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) +; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) +; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK: store i8 7, ptr addrspace(1) {{.*}}, align 4 +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + store i8 7, ptr addrspace(3) @lds1, align 4 + ret void +} + +define amdgpu_kernel void @barkernel() #0 { +; CHECK-LABEL: define amdgpu_kernel void @barkernel( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] { +; CHECK: {{.*}} = call i64 @__asan_malloc_impl(i64 {{.*}}, i64 {{.*}}) +; CHECK: call void @llvm.amdgcn.s.barrier() +; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) +; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) +; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK: call void @bar() +; CHECK: store i8 10, ptr addrspace(1) {{.*}}, align 4 +; CHECK: call void @__asan_free_impl(i64 {{.*}}, i64 {{.*}}) +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + call void @bar() + store i8 10, ptr addrspace(3) @lds1, align 4 + ret void +} + +declare void @llvm.amdgcn.s.barrier() #1 +declare void @llvm.amdgcn.s.barrier.wait(i16) #1 +declare void @llvm.amdgcn.s.barrier.signal(i32) #1 +declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1 +declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1 +declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1 +declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1 +declare void @llvm.amdgcn.s.barrier.leave(i16) #1 +declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1 +declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1 + +attributes #0 = { nounwind sanitize_address } +attributes #1 = { convergent nounwind } +attributes #2 = { nounwind readnone } + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"nosanitize_address", i32 1} +;. +; CHECK: attributes #[[ATTR0]] = { nounwind sanitize_address } +; CHECK: attributes #[[ATTR1]] = { nounwind sanitize_address "amdgpu-lds-size"="8" } +;. +; CHECK: [[META0]] = !{i32 8396880, i32 8396881} +; CHECK: [[META1]] = !{i32 8396816, i32 8396817} +;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll index 782d94845a358..bde6db6463cb1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-exec-sync < %s 2>&1 | FileCheck %s +; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-lower-exec-sync -mtriple=amdgcn-amd-amdhsa | FileCheck %s %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll index 3aa36635a0ab6..8342e84fb4f5d 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll @@ -9,11 +9,11 @@ ; RUN: | FileCheck -check-prefix=GCN-O3 %s -; GCN-O0: require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O0: require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-lower-exec-sync,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O2: require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O2: require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-lower-exec-sync,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O3: require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O3: require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-lower-exec-sync,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) define void @empty() { ret void diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 6e5212580ba2e..b3e07dc6d2838 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -39,6 +39,7 @@ ; GCN-O0-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O0-NEXT: Function Alias Analysis Results ; GCN-O0-NEXT: Externalize enqueued block runtime handles +; GCN-O0-NEXT: AMDGPU lowering of execution synchronization globals ; GCN-O0-NEXT: AMDGPU Software lowering of LDS ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager @@ -187,6 +188,7 @@ ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Externalize enqueued block runtime handles +; GCN-O1-NEXT: AMDGPU lowering of execution synchronization globals ; GCN-O1-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-NEXT: FunctionPass Manager @@ -474,6 +476,7 @@ ; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Externalize enqueued block runtime handles +; GCN-O1-OPTS-NEXT: AMDGPU lowering of execution synchronization globals ; GCN-O1-OPTS-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager @@ -791,6 +794,7 @@ ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Externalize enqueued block runtime handles +; GCN-O2-NEXT: AMDGPU lowering of execution synchronization globals ; GCN-O2-NEXT: AMDGPU Software lowering of LDS ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: FunctionPass Manager @@ -1112,6 +1116,7 @@ ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Externalize enqueued block runtime handles +; GCN-O3-NEXT: AMDGPU lowering of execution synchronization globals ; GCN-O3-NEXT: AMDGPU Software lowering of LDS ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O3-NEXT: FunctionPass Manager diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll index 03a666fbe3aea..9f3dfb01282bc 100644 --- a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-exec-sync,amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=SOUT %s %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }