Skip to content

Commit 6f02969

Browse files
authored
[OMPIRBuilder] Don't outline DISTRIBUTE on CPUs (#158317)
We use different OpenMP runtime functions on CPU and on target offload. The one used for DISTRIBUTE on target offload needs a function pointer to an offloaded function, but the one on CPU doesn't. This caused unnecessary overhead on CPUs because SHARED or FIRSTPRIVATE memory from the surrounding context had to be packaged into a context structure just for an ordinary function call (which would hopefully eventually get inlined). This also made the IR harder to read.
1 parent 2c12a3d commit 6f02969

File tree

7 files changed

+20
-44
lines changed

7 files changed

+20
-44
lines changed

llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10036,12 +10036,16 @@ OpenMPIRBuilder::createDistribute(const LocationDescription &Loc,
1003610036
if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
1003710037
return Err;
1003810038

10039-
OutlineInfo OI;
10040-
OI.OuterAllocaBB = OuterAllocaIP.getBlock();
10041-
OI.EntryBB = AllocaBB;
10042-
OI.ExitBB = ExitBB;
10039+
// When using target we use different runtime functions which require a
10040+
// callback.
10041+
if (Config.isTargetDevice()) {
10042+
OutlineInfo OI;
10043+
OI.OuterAllocaBB = OuterAllocaIP.getBlock();
10044+
OI.EntryBB = AllocaBB;
10045+
OI.ExitBB = ExitBB;
1004310046

10044-
addOutlineInfo(std::move(OI));
10047+
addOutlineInfo(std::move(OI));
10048+
}
1004510049
Builder.SetInsertPoint(ExitBB, ExitBB->begin());
1004610050

1004710051
return Builder.saveIP();

mlir/test/Target/LLVMIR/openmp-cancel-distribute-parallel-loop.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ llvm.func @cancel_distribute_parallel_do(%lb : i32, %ub : i32, %step : i32) {
3232
// CHECK: omp.region.cont6:
3333
// CHECK: br label %omp.region.cont4
3434
// CHECK: omp.region.cont4:
35-
// CHECK: br label %distribute.exit.exitStub
35+
// CHECK: br label %omp.par.exit.exitStub
3636
// CHECK: omp_loop.body:
3737
// CHECK: %[[VAL_111:.*]] = add i32 %{{.*}}, %{{.*}}
3838
// CHECK: %[[VAL_112:.*]] = mul i32 %[[VAL_111]], %{{.*}}
@@ -52,6 +52,6 @@ llvm.func @cancel_distribute_parallel_do(%lb : i32, %ub : i32, %step : i32) {
5252
// CHECK: omp_loop.inc:
5353
// CHECK: %[[VAL_100:.*]] = add nuw i32 %{{.*}}, 1
5454
// CHECK: br label %omp_loop.header
55-
// CHECK: distribute.exit.exitStub:
55+
// CHECK: omp.par.exit.exitStub:
5656
// CHECK: ret void
5757

mlir/test/Target/LLVMIR/openmp-distribute-private.mlir

Lines changed: 5 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,6 @@ llvm.func @_QQmain() {
3434
// CHECK: }
3535

3636
// CHECK: define internal void @[[TEAMS_FUNC]]({{.*}}) {
37-
// CHECK: call void @[[DIST_FUNC:.*]]()
38-
// CHECK-NEXT: br label %distribute.exit
39-
// CHECK: }
40-
41-
// CHECK: define internal void @[[DIST_FUNC]]() {
4237
// CHECK: %[[PRIV_VAR_ALLOC:.*]] = alloca float, align 4
4338
// CHECK: %[[IV_ALLOC:.*]] = alloca i32, align 4
4439

@@ -78,29 +73,22 @@ llvm.func @_QQmain() {
7873

7974
// CHECK-LABEL: define void @_QQmain() {
8075
// CHECK: %[[SHARED_VAR_ALLOC:.*]] = alloca float, i64 1, align 4
81-
// CHECK: %[[SHARED_VAR_PTR:.*]] = getelementptr { ptr }, ptr %[[DIST_PARAM:.*]], i32 0, i32 0
82-
// CHECK: store ptr %[[SHARED_VAR_ALLOC]], ptr %[[SHARED_VAR_PTR]], align 8
83-
// CHECK: call void @[[DIST_FUNC:.*]](ptr %[[DIST_PARAM]])
84-
// CHECK-NEXT: br label %distribute.exit
85-
// CHECK: }
8676

87-
// CHECK: define internal void @[[DIST_FUNC]](ptr %[[DIST_ARG:.*]]) {
88-
// CHECK: %[[SHARED_VAR_GEP:.*]] = getelementptr { ptr }, ptr %[[DIST_ARG]], i32 0, i32 0
89-
// CHECK: %[[SHARED_VAR_PTR2:.*]] = load ptr, ptr %[[SHARED_VAR_GEP]], align 8
77+
// CHECK: distribute.alloca:
9078
// CHECK: %[[PRIV_VAR_ALLOC:.*]] = alloca float, align 4
9179

9280
// CHECK: omp.private.copy:
93-
// CHECK-NEXT: %[[SHARED_VAR_VAL:.*]] = load float, ptr %[[SHARED_VAR_PTR2]], align 4
81+
// CHECK-NEXT: %[[SHARED_VAR_VAL:.*]] = load float, ptr %[[SHARED_VAR_ALLOC]], align 4
9482
// CHECK-NEXT: store float %[[SHARED_VAR_VAL]], ptr %[[PRIV_VAR_ALLOC]], align 4
9583

84+
// CHECK: omp.loop_nest.region:
85+
// CHECK-NEXT: store float 0x40091EB860000000, ptr %[[PRIV_VAR_ALLOC]], align 4
86+
9687
// CHECK: omp_loop.after:
9788
// CHECK-NEXT: br label %omp.region.cont
9889

9990
// CHECK: omp.region.cont:
10091
// CHECK-NEXT: call void @foo_free(ptr %[[PRIV_VAR_ALLOC]])
101-
102-
// CHECK: omp.loop_nest.region:
103-
// CHECK-NEXT: store float 0x40091EB860000000, ptr %[[PRIV_VAR_ALLOC]], align 4
10492
// CHECK: }
10593

10694

mlir/test/Target/LLVMIR/openmp-llvm.mlir

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3339,12 +3339,6 @@ llvm.func @distribute() {
33393339
}
33403340

33413341
// CHECK-LABEL: define void @distribute
3342-
// CHECK: call void @[[OUTLINED:.*]]({{.*}})
3343-
// CHECK-NEXT: br label %[[EXIT:.*]]
3344-
// CHECK: [[EXIT]]:
3345-
// CHECK: ret void
3346-
3347-
// CHECK: define internal void @[[OUTLINED]]({{.*}})
33483342
// CHECK: %[[LASTITER:.*]] = alloca i32
33493343
// CHECK: %[[LB:.*]] = alloca i64
33503344
// CHECK: %[[UB:.*]] = alloca i64
@@ -3381,9 +3375,7 @@ llvm.func @distribute_wsloop(%lb : i32, %ub : i32, %step : i32) {
33813375
// CHECK: call void{{.*}}@__kmpc_fork_call({{.*}}, ptr @[[OUTLINED_PARALLEL:.*]],
33823376

33833377
// CHECK: define internal void @[[OUTLINED_PARALLEL]]
3384-
// CHECK: call void @[[OUTLINED_DISTRIBUTE:.*]]({{.*}})
3385-
3386-
// CHECK: define internal void @[[OUTLINED_DISTRIBUTE]]
3378+
// CHECK: distribute.alloca:
33873379
// CHECK: %[[LASTITER:.*]] = alloca i32
33883380
// CHECK: %[[LB:.*]] = alloca i32
33893381
// CHECK: %[[UB:.*]] = alloca i32

mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,6 @@ module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-a
4949
// HOST: call void{{.*}}@__kmpc_fork_teams({{.*}}, ptr @[[TEAMS_OUTLINE:.*]], {{.*}})
5050

5151
// HOST: define internal void @[[TEAMS_OUTLINE]]
52-
// HOST: call void @[[DISTRIBUTE_OUTLINE:.*]]({{.*}})
53-
54-
// HOST: define internal void @[[DISTRIBUTE_OUTLINE]]
5552
// HOST: call void @__kmpc_for_static_init{{.*}}(ptr {{.*}}, i32 {{.*}}, i32 92, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, i32 {{.*}}, i32 {{.*}})
5653
// HOST: call void (ptr, i32, ptr, ...) @__kmpc_fork_call({{.*}}, ptr @[[PARALLEL_OUTLINE:.*]], {{.*}})
5754

mlir/test/Target/LLVMIR/openmp-target-spmd.mlir

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,6 @@ module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-a
4646
// HOST: call void{{.*}}@__kmpc_fork_call({{.*}}, ptr @[[PARALLEL_OUTLINE:.*]], {{.*}})
4747

4848
// HOST: define internal void @[[PARALLEL_OUTLINE]]
49-
// HOST: call void @[[DISTRIBUTE_OUTLINE:.*]]({{.*}})
50-
51-
// HOST: define internal void @[[DISTRIBUTE_OUTLINE]]
5249
// HOST: call void @__kmpc_dist_for_static_init{{.*}}(ptr {{.*}}, i32 {{.*}}, i32 34, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, i32 {{.*}}, i32 {{.*}})
5350

5451
//--- device.mlir

mlir/test/Target/LLVMIR/openmp-teams-distribute-parallel-do-simd.mlir

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,20 @@
33
// Check that omp.simd as a leaf of a composite construct still generates
44
// the appropriate loop vectorization attribute.
55

6-
// CHECK-LABEL: define internal void @test_teams_distribute_parallel_do_simd..omp_par.2
6+
// CHECK-LABEL: define internal void @test_teams_distribute_parallel_do_simd..omp_par.1
77
// CHECK: teams.body:
88
// CHECK: omp.teams.region:
99

10-
// CHECK-LABEL: define internal void @test_teams_distribute_parallel_do_simd..omp_par.1
10+
// CHECK-LABEL: define internal void @test_teams_distribute_parallel_do_simd..omp_par
1111
// CHECK: omp.par.entry:
1212
// CHECK: omp.par.region:
13-
// CHECK: distribute.exit:
14-
15-
// CHECK-LABEL: define internal void @test_teams_distribute_parallel_do_simd..omp_par
1613
// CHECK: distribute.body:
1714
// CHECK: omp.distribute.region:
1815
// CHECK: omp_loop.header:
1916
// CHECK: omp_loop.inc:
2017
// CHECK-NEXT: %omp_loop.next = add nuw i32 %omp_loop.iv, 1
2118
// CHECK-NEXT: br label %omp_loop.header, !llvm.loop ![[LOOP_ATTR:.*]]
19+
// CHECK: omp.par.exit.exitStub:
2220

2321
// CHECK: ![[LOOP_ATTR]] = distinct !{![[LOOP_ATTR]], ![[LPAR:.*]], ![[LVEC:.*]]}
2422
// CHECK: ![[LPAR]] = !{!"llvm.loop.parallel_accesses", ![[PAR_ACC:.*]]}

0 commit comments

Comments
 (0)