From e19aa7e079e4999bec24e77f218443a04a90f879 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Fri, 12 Sep 2025 15:25:56 +0000 Subject: [PATCH] [OMPIRBuilder] Don't outline DISTRIBUTE on CPUs We use different OpenMP runtime functions on CPU and target offload. The one used for DISTRIBUTE on target offload needs a function pointer to an offloaded function, but the one on CPU doesn't. This caused unnessecary overhead on CPUs because SHARED or FIRSTPRIVATE memory from the surrounding context has to be packaged into a context structure just for an ordinary function call (which would hopefully eventually get inlined). This also makes the IR harder to read. --- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 14 +++++++----- ...penmp-cancel-distribute-parallel-loop.mlir | 4 ++-- .../LLVMIR/openmp-distribute-private.mlir | 22 +++++-------------- mlir/test/Target/LLVMIR/openmp-llvm.mlir | 10 +-------- .../LLVMIR/openmp-target-generic-spmd.mlir | 3 --- .../Target/LLVMIR/openmp-target-spmd.mlir | 3 --- ...nmp-teams-distribute-parallel-do-simd.mlir | 8 +++---- 7 files changed, 20 insertions(+), 44 deletions(-) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 6d948f184392d..9b67465faab0b 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -10036,12 +10036,16 @@ OpenMPIRBuilder::createDistribute(const LocationDescription &Loc, if (Error Err = BodyGenCB(AllocaIP, CodeGenIP)) return Err; - OutlineInfo OI; - OI.OuterAllocaBB = OuterAllocaIP.getBlock(); - OI.EntryBB = AllocaBB; - OI.ExitBB = ExitBB; + // When using target we use different runtime functions which require a + // callback. + if (Config.isTargetDevice()) { + OutlineInfo OI; + OI.OuterAllocaBB = OuterAllocaIP.getBlock(); + OI.EntryBB = AllocaBB; + OI.ExitBB = ExitBB; - addOutlineInfo(std::move(OI)); + addOutlineInfo(std::move(OI)); + } Builder.SetInsertPoint(ExitBB, ExitBB->begin()); return Builder.saveIP(); diff --git a/mlir/test/Target/LLVMIR/openmp-cancel-distribute-parallel-loop.mlir b/mlir/test/Target/LLVMIR/openmp-cancel-distribute-parallel-loop.mlir index 2339022be8979..b91c97738f87f 100644 --- a/mlir/test/Target/LLVMIR/openmp-cancel-distribute-parallel-loop.mlir +++ b/mlir/test/Target/LLVMIR/openmp-cancel-distribute-parallel-loop.mlir @@ -32,7 +32,7 @@ llvm.func @cancel_distribute_parallel_do(%lb : i32, %ub : i32, %step : i32) { // CHECK: omp.region.cont6: // CHECK: br label %omp.region.cont4 // CHECK: omp.region.cont4: -// CHECK: br label %distribute.exit.exitStub +// CHECK: br label %omp.par.exit.exitStub // CHECK: omp_loop.body: // CHECK: %[[VAL_111:.*]] = add i32 %{{.*}}, %{{.*}} // CHECK: %[[VAL_112:.*]] = mul i32 %[[VAL_111]], %{{.*}} @@ -52,6 +52,6 @@ llvm.func @cancel_distribute_parallel_do(%lb : i32, %ub : i32, %step : i32) { // CHECK: omp_loop.inc: // CHECK: %[[VAL_100:.*]] = add nuw i32 %{{.*}}, 1 // CHECK: br label %omp_loop.header -// CHECK: distribute.exit.exitStub: +// CHECK: omp.par.exit.exitStub: // CHECK: ret void diff --git a/mlir/test/Target/LLVMIR/openmp-distribute-private.mlir b/mlir/test/Target/LLVMIR/openmp-distribute-private.mlir index 188c12ebfd3c7..ef118e0ad1df2 100644 --- a/mlir/test/Target/LLVMIR/openmp-distribute-private.mlir +++ b/mlir/test/Target/LLVMIR/openmp-distribute-private.mlir @@ -34,11 +34,6 @@ llvm.func @_QQmain() { // CHECK: } // CHECK: define internal void @[[TEAMS_FUNC]]({{.*}}) { -// CHECK: call void @[[DIST_FUNC:.*]]() -// CHECK-NEXT: br label %distribute.exit -// CHECK: } - -// CHECK: define internal void @[[DIST_FUNC]]() { // CHECK: %[[PRIV_VAR_ALLOC:.*]] = alloca float, align 4 // CHECK: %[[IV_ALLOC:.*]] = alloca i32, align 4 @@ -78,29 +73,22 @@ llvm.func @_QQmain() { // CHECK-LABEL: define void @_QQmain() { // CHECK: %[[SHARED_VAR_ALLOC:.*]] = alloca float, i64 1, align 4 -// CHECK: %[[SHARED_VAR_PTR:.*]] = getelementptr { ptr }, ptr %[[DIST_PARAM:.*]], i32 0, i32 0 -// CHECK: store ptr %[[SHARED_VAR_ALLOC]], ptr %[[SHARED_VAR_PTR]], align 8 -// CHECK: call void @[[DIST_FUNC:.*]](ptr %[[DIST_PARAM]]) -// CHECK-NEXT: br label %distribute.exit -// CHECK: } -// CHECK: define internal void @[[DIST_FUNC]](ptr %[[DIST_ARG:.*]]) { -// CHECK: %[[SHARED_VAR_GEP:.*]] = getelementptr { ptr }, ptr %[[DIST_ARG]], i32 0, i32 0 -// CHECK: %[[SHARED_VAR_PTR2:.*]] = load ptr, ptr %[[SHARED_VAR_GEP]], align 8 +// CHECK: distribute.alloca: // CHECK: %[[PRIV_VAR_ALLOC:.*]] = alloca float, align 4 // CHECK: omp.private.copy: -// CHECK-NEXT: %[[SHARED_VAR_VAL:.*]] = load float, ptr %[[SHARED_VAR_PTR2]], align 4 +// CHECK-NEXT: %[[SHARED_VAR_VAL:.*]] = load float, ptr %[[SHARED_VAR_ALLOC]], align 4 // CHECK-NEXT: store float %[[SHARED_VAR_VAL]], ptr %[[PRIV_VAR_ALLOC]], align 4 +// CHECK: omp.loop_nest.region: +// CHECK-NEXT: store float 0x40091EB860000000, ptr %[[PRIV_VAR_ALLOC]], align 4 + // CHECK: omp_loop.after: // CHECK-NEXT: br label %omp.region.cont // CHECK: omp.region.cont: // CHECK-NEXT: call void @foo_free(ptr %[[PRIV_VAR_ALLOC]]) - -// CHECK: omp.loop_nest.region: -// CHECK-NEXT: store float 0x40091EB860000000, ptr %[[PRIV_VAR_ALLOC]], align 4 // CHECK: } diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir index 27210bc0890ce..8bd33a382197e 100644 --- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir @@ -3339,12 +3339,6 @@ llvm.func @distribute() { } // CHECK-LABEL: define void @distribute -// CHECK: call void @[[OUTLINED:.*]]({{.*}}) -// CHECK-NEXT: br label %[[EXIT:.*]] -// CHECK: [[EXIT]]: -// CHECK: ret void - -// CHECK: define internal void @[[OUTLINED]]({{.*}}) // CHECK: %[[LASTITER:.*]] = alloca i32 // CHECK: %[[LB:.*]] = alloca i64 // CHECK: %[[UB:.*]] = alloca i64 @@ -3381,9 +3375,7 @@ llvm.func @distribute_wsloop(%lb : i32, %ub : i32, %step : i32) { // CHECK: call void{{.*}}@__kmpc_fork_call({{.*}}, ptr @[[OUTLINED_PARALLEL:.*]], // CHECK: define internal void @[[OUTLINED_PARALLEL]] -// CHECK: call void @[[OUTLINED_DISTRIBUTE:.*]]({{.*}}) - -// CHECK: define internal void @[[OUTLINED_DISTRIBUTE]] +// CHECK: distribute.alloca: // CHECK: %[[LASTITER:.*]] = alloca i32 // CHECK: %[[LB:.*]] = alloca i32 // CHECK: %[[UB:.*]] = alloca i32 diff --git a/mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir b/mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir index 9bb2b40a43def..504d91b1f6198 100644 --- a/mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir +++ b/mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir @@ -49,9 +49,6 @@ module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-a // HOST: call void{{.*}}@__kmpc_fork_teams({{.*}}, ptr @[[TEAMS_OUTLINE:.*]], {{.*}}) // HOST: define internal void @[[TEAMS_OUTLINE]] -// HOST: call void @[[DISTRIBUTE_OUTLINE:.*]]({{.*}}) - -// HOST: define internal void @[[DISTRIBUTE_OUTLINE]] // HOST: call void @__kmpc_for_static_init{{.*}}(ptr {{.*}}, i32 {{.*}}, i32 92, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, i32 {{.*}}, i32 {{.*}}) // HOST: call void (ptr, i32, ptr, ...) @__kmpc_fork_call({{.*}}, ptr @[[PARALLEL_OUTLINE:.*]], {{.*}}) diff --git a/mlir/test/Target/LLVMIR/openmp-target-spmd.mlir b/mlir/test/Target/LLVMIR/openmp-target-spmd.mlir index 86dff678bf639..20202fc7fc16c 100644 --- a/mlir/test/Target/LLVMIR/openmp-target-spmd.mlir +++ b/mlir/test/Target/LLVMIR/openmp-target-spmd.mlir @@ -46,9 +46,6 @@ module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-a // HOST: call void{{.*}}@__kmpc_fork_call({{.*}}, ptr @[[PARALLEL_OUTLINE:.*]], {{.*}}) // HOST: define internal void @[[PARALLEL_OUTLINE]] -// HOST: call void @[[DISTRIBUTE_OUTLINE:.*]]({{.*}}) - -// HOST: define internal void @[[DISTRIBUTE_OUTLINE]] // HOST: call void @__kmpc_dist_for_static_init{{.*}}(ptr {{.*}}, i32 {{.*}}, i32 34, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, i32 {{.*}}, i32 {{.*}}) //--- device.mlir diff --git a/mlir/test/Target/LLVMIR/openmp-teams-distribute-parallel-do-simd.mlir b/mlir/test/Target/LLVMIR/openmp-teams-distribute-parallel-do-simd.mlir index 4d766cc1ac4f4..69d5d225d0515 100644 --- a/mlir/test/Target/LLVMIR/openmp-teams-distribute-parallel-do-simd.mlir +++ b/mlir/test/Target/LLVMIR/openmp-teams-distribute-parallel-do-simd.mlir @@ -3,22 +3,20 @@ // Check that omp.simd as a leaf of a composite construct still generates // the appropriate loop vectorization attribute. -// CHECK-LABEL: define internal void @test_teams_distribute_parallel_do_simd..omp_par.2 +// CHECK-LABEL: define internal void @test_teams_distribute_parallel_do_simd..omp_par.1 // CHECK: teams.body: // CHECK: omp.teams.region: -// CHECK-LABEL: define internal void @test_teams_distribute_parallel_do_simd..omp_par.1 +// CHECK-LABEL: define internal void @test_teams_distribute_parallel_do_simd..omp_par // CHECK: omp.par.entry: // CHECK: omp.par.region: -// CHECK: distribute.exit: - -// CHECK-LABEL: define internal void @test_teams_distribute_parallel_do_simd..omp_par // CHECK: distribute.body: // CHECK: omp.distribute.region: // CHECK: omp_loop.header: // CHECK: omp_loop.inc: // CHECK-NEXT: %omp_loop.next = add nuw i32 %omp_loop.iv, 1 // CHECK-NEXT: br label %omp_loop.header, !llvm.loop ![[LOOP_ATTR:.*]] +// CHECK: omp.par.exit.exitStub: // CHECK: ![[LOOP_ATTR]] = distinct !{![[LOOP_ATTR]], ![[LPAR:.*]], ![[LVEC:.*]]} // CHECK: ![[LPAR]] = !{!"llvm.loop.parallel_accesses", ![[PAR_ACC:.*]]}