-
Notifications
You must be signed in to change notification settings - Fork 11k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[OpenMPIRBuilder] CodeExtractor Aggregate Allocation Placed at Wrong Point in Nested OpenMP Parallel #54165
Comments
@llvm/issue-subscribers-mlir-llvm |
I have identified the reason in the LLVM IR, but not its root cause. The IR looks as follows (see comments): ;;;;;;;
define i32 @main(i32 %0, i8** %1) {
%structArg35 = alloca { { i64* }* }, align 8
; a struct with a pointer in it is allocated in @main
%structArg = alloca { i64* }, align 8
; the pointer to this struct is passed into the function executed in parallel
%gep_structArg = getelementptr { { i64* }* }, { { i64* }* }* %structArg35, i32 0, i32 0
store { i64* }* %structArg, { i64* }** %gep_structArg, align 8
call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { { i64* }* }*)* @main..omp_par.1 to void (i32*, i32*, ...)*), { { i64* }* }* %structArg35), !dbg !9
; ....
}
;;;;;;;;
define internal void @main..omp_par.1(i32* noalias %tid.addr, i32* noalias %zero.addr, { { i64* }* }* %0) {
; The structure containing the pointer is extracted from arguments
%gep_structArg = getelementptr { { i64* }* }, { { i64* }* }* %0, i32 0, i32 0
%loadgep_structArg = load { i64* }*, { i64* }** %gep_structArg, align 8
; Alloca coming from MLIR
%2 = alloca i64, i64 1, align 8, !dbg !12
; Getting the pointer stored in the structure.
; PROBLEM: this points back to the stack of @main and is shared by all threads
; all threads race to write their local "alloca"ed addresses into the same place
%gep_ = getelementptr { i64* }, { i64* }* %loadgep_structArg, i32 0, i32 0
store i64* %2, i64** %gep_, align 8
; Inside this function, we will read the pointer and write 0 to it, but the pointer
; may be pointing to memory "alloca"ed on the stack of another thread, leaving
; the memory of this thread uninitialized.
call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { i64* }*)* @main..omp_par to void (i32*, i32*, ...)*), { i64* }* %loadgep_structArg), !dbg !16
}
Replacing |
Debug output of OpenMPIRBuilder Created OpenMP runtime function __kmpc_global_thread_num with type i32 (%struct.ident_t*)
Before body codegen: define i32 @main(i32 %0, i8** %1) !dbg !3 {
%tid.addr = alloca i32, align 4, !dbg !7
%zero.addr = alloca i32, align 4, !dbg !7
br label %parallel.entry, !dbg !7
parallel.entry: ; preds = %2
%omp_global_thread_num = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1), !dbg !7
br label %omp.par.entry
omp.par.entry: ; preds = %parallel.entry
%tid.addr.local = alloca i32, align 4
%tid = load i32, i32* %tid.addr.local, align 4
%tid.addr.use = load i32, i32* %tid.addr, align 4
%zero.addr.use = load i32, i32* %zero.addr, align 4
br label %omp.par.region
omp.par.region: ; preds = %omp.par.entry
br label %omp.par.pre_finalize
omp.par.pre_finalize: ; preds = %omp.par.region
br label %omp.par.exit
omp.par.exit: ; preds = %omp.par.pre_finalize
unreachable
}
Found OpenMP runtime function __kmpc_global_thread_num with type i32 (%struct.ident_t*)
Before body codegen: define i32 @main(i32 %0, i8** %1) !dbg !3 {
%tid.addr = alloca i32, align 4, !dbg !7
%zero.addr = alloca i32, align 4, !dbg !7
br label %parallel.entry, !dbg !7
parallel.entry: ; preds = %2
%omp_global_thread_num = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1), !dbg !7
br label %omp.par.entry
omp.par.region1: ; preds = %omp.par.region
%3 = alloca i64, i64 1, align 8, !dbg !9
br label %omp_loop.preheader
omp.wsloop.region: ; preds = %omp_loop.body
%omp_global_thread_num2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @3), !dbg !10
br label %omp.par.entry5
omp.par.entry5: ; preds = %omp.wsloop.region
%tid.addr.local9 = alloca i32, align 4
%tid10 = load i32, i32* %tid.addr.local9, align 4
%tid.addr.use11 = load i32, i32* %tid.addr3, align 4
%zero.addr.use12 = load i32, i32* %zero.addr4, align 4
br label %omp.par.region6
omp.par.region6: ; preds = %omp.par.entry5
br label %omp.par.pre_finalize7
omp.par.pre_finalize7: ; preds = %omp.par.region6
br label %omp.par.exit8
omp.par.exit8: ; preds = %omp.par.pre_finalize7
unreachable
omp_loop.preheader: ; preds = %omp.par.region1
br label %omp_loop.header
omp_loop.header: ; preds = %omp_loop.inc, %omp_loop.preheader
%omp_loop.iv = phi i64 [ 0, %omp_loop.preheader ], [ %omp_loop.next, %omp_loop.inc ]
br label %omp_loop.cond
omp_loop.cond: ; preds = %omp_loop.header
%omp_loop.cmp = icmp ult i64 %omp_loop.iv, 10
br i1 %omp_loop.cmp, label %omp_loop.body, label %omp_loop.exit
omp_loop.body: ; preds = %omp_loop.cond
%4 = mul i64 %omp_loop.iv, 1
%5 = add i64 %4, 0
br label %omp.wsloop.region
omp.wsloop.exit: ; No predecessors!
br label %omp_loop.inc
omp_loop.inc: ; preds = %omp.wsloop.exit
%omp_loop.next = add nuw i64 %omp_loop.iv, 1
br label %omp_loop.header
omp_loop.exit: ; preds = %omp_loop.cond
br label %omp_loop.after
omp_loop.after: ; preds = %omp_loop.exit
omp.par.entry: ; preds = %parallel.entry
%tid.addr.local = alloca i32, align 4
%tid = load i32, i32* %tid.addr.local, align 4
%tid.addr.use = load i32, i32* %tid.addr, align 4
%zero.addr.use = load i32, i32* %zero.addr, align 4
%tid.addr3 = alloca i32, align 4
%zero.addr4 = alloca i32, align 4
br label %omp.par.region
omp.par.region: ; preds = %omp.par.entry
br label %omp.par.region1
omp.par.pre_finalize: ; No predecessors!
br label %omp.par.exit
omp.par.exit: ; preds = %omp.par.pre_finalize
unreachable
}
Created OpenMP runtime function __kmpc_for_static_init_8u with type void (%struct.ident_t*, i32, i32, i32*, i64*, i64*, i64*, i64, i64)
Created OpenMP runtime function __kmpc_for_static_fini with type void (%struct.ident_t*, i32)
Found OpenMP runtime function __kmpc_global_thread_num with type i32 (%struct.ident_t*)
Found OpenMP runtime function __kmpc_global_thread_num with type i32 (%struct.ident_t*)
Created OpenMP runtime function __kmpc_barrier with type void (%struct.ident_t*, i32)
After body codegen: define i32 @main(i32 %0, i8** %1) !dbg !3 {
%tid.addr = alloca i32, align 4, !dbg !7
%zero.addr = alloca i32, align 4, !dbg !7
br label %parallel.entry, !dbg !7
parallel.entry: ; preds = %2
%omp_global_thread_num = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1), !dbg !7
br label %omp.par.entry
omp.par.region1: ; preds = %omp.par.region
%3 = alloca i64, i64 1, align 8, !dbg !9
br label %omp_loop.preheader
omp.wsloop.region: ; preds = %omp_loop.body
%omp_global_thread_num2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @3), !dbg !10
br label %omp.par.entry5
omp.par.region13: ; preds = %omp.par.region6
br label %omp_loop.preheader14
omp.wsloop.region25: ; preds = %omp_loop.body17
store i64 0, i64* %3, align 8, !dbg !11
br label %omp.wsloop.exit24, !dbg !12
omp_loop.preheader14: ; preds = %omp.par.region13
store i64 0, i64* %p.lowerbound, align 8
store i64 0, i64* %p.upperbound, align 8
store i64 1, i64* %p.stride, align 8
%omp_global_thread_num26 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @5)
call void @__kmpc_for_static_init_8u(%struct.ident_t* @5, i32 %omp_global_thread_num26, i32 34, i32* %p.lastiter, i64* %p.lowerbound, i64* %p.upperbound, i64* %p.stride, i64 1, i64 0)
%4 = load i64, i64* %p.lowerbound, align 8
%5 = load i64, i64* %p.upperbound, align 8
%6 = sub i64 %5, %4
%7 = add i64 %6, 1
br label %omp_loop.header15
omp_loop.header15: ; preds = %omp_loop.inc18, %omp_loop.preheader14
%omp_loop.iv21 = phi i64 [ 0, %omp_loop.preheader14 ], [ %omp_loop.next23, %omp_loop.inc18 ]
br label %omp_loop.cond16
omp_loop.cond16: ; preds = %omp_loop.header15
%omp_loop.cmp22 = icmp ult i64 %omp_loop.iv21, %7
br i1 %omp_loop.cmp22, label %omp_loop.body17, label %omp_loop.exit19
omp_loop.body17: ; preds = %omp_loop.cond16
%8 = add i64 %omp_loop.iv21, %4, !dbg !13
%9 = mul i64 %8, 1
%10 = add i64 %9, 0
br label %omp.wsloop.region25
omp.wsloop.exit24: ; preds = %omp.wsloop.region25
br label %omp_loop.inc18
omp_loop.inc18: ; preds = %omp.wsloop.exit24
%omp_loop.next23 = add nuw i64 %omp_loop.iv21, 1
br label %omp_loop.header15
omp_loop.exit19: ; preds = %omp_loop.cond16
call void @__kmpc_for_static_fini(%struct.ident_t* @5, i32 %omp_global_thread_num26)
%omp_global_thread_num27 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @5), !dbg !13
call void @__kmpc_barrier(%struct.ident_t* @6, i32 %omp_global_thread_num27), !dbg !13
br label %omp_loop.after20
omp_loop.after20: ; preds = %omp_loop.exit19
br label %omp.par.pre_finalize7, !dbg !14
omp.par.entry5: ; preds = %omp.wsloop.region
%tid.addr.local9 = alloca i32, align 4
%tid10 = load i32, i32* %tid.addr.local9, align 4
%tid.addr.use11 = load i32, i32* %tid.addr3, align 4
%zero.addr.use12 = load i32, i32* %zero.addr4, align 4
%p.lastiter = alloca i32, align 4
%p.lowerbound = alloca i64, align 8
%p.upperbound = alloca i64, align 8
%p.stride = alloca i64, align 8
br label %omp.par.region6
omp.par.region6: ; preds = %omp.par.entry5
br label %omp.par.region13
omp.par.pre_finalize7: ; preds = %omp_loop.after20
br label %omp.par.exit8
omp.par.exit8: ; preds = %omp.par.pre_finalize7
unreachable
omp_loop.preheader: ; preds = %omp.par.region1
br label %omp_loop.header
omp_loop.header: ; preds = %omp_loop.inc, %omp_loop.preheader
%omp_loop.iv = phi i64 [ 0, %omp_loop.preheader ], [ %omp_loop.next, %omp_loop.inc ]
br label %omp_loop.cond
omp_loop.cond: ; preds = %omp_loop.header
%omp_loop.cmp = icmp ult i64 %omp_loop.iv, 10
br i1 %omp_loop.cmp, label %omp_loop.body, label %omp_loop.exit
omp_loop.body: ; preds = %omp_loop.cond
%11 = mul i64 %omp_loop.iv, 1
%12 = add i64 %11, 0
br label %omp.wsloop.region
omp.wsloop.exit: ; No predecessors!
br label %omp_loop.inc
omp_loop.inc: ; preds = %omp.wsloop.exit
%omp_loop.next = add nuw i64 %omp_loop.iv, 1
br label %omp_loop.header
omp_loop.exit: ; preds = %omp_loop.cond
br label %omp_loop.after
omp_loop.after: ; preds = %omp_loop.exit
omp.par.entry: ; preds = %parallel.entry
%tid.addr.local = alloca i32, align 4
%tid = load i32, i32* %tid.addr.local, align 4
%tid.addr.use = load i32, i32* %tid.addr, align 4
%zero.addr.use = load i32, i32* %zero.addr, align 4
%tid.addr3 = alloca i32, align 4
%zero.addr4 = alloca i32, align 4
br label %omp.par.region
omp.par.region: ; preds = %omp.par.entry
br label %omp.par.region1
omp.par.pre_finalize: ; No predecessors!
br label %omp.par.exit
omp.par.exit: ; preds = %omp.par.pre_finalize
unreachable
}
Created OpenMP runtime function __kmpc_fork_call with type void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...)
Before privatization: define i32 @main(i32 %0, i8** %1) !dbg !3 {
%tid.addr = alloca i32, align 4, !dbg !7
%zero.addr = alloca i32, align 4, !dbg !7
br label %parallel.entry, !dbg !7
parallel.entry: ; preds = %2
%omp_global_thread_num = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1), !dbg !7
br label %omp.par.entry
omp.par.region1: ; preds = %omp.par.region
%3 = alloca i64, i64 1, align 8, !dbg !9
br label %omp_loop.preheader
omp.wsloop.region: ; preds = %omp_loop.body
%omp_global_thread_num2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @3), !dbg !10
br label %omp.par.entry5
omp.par.region13: ; preds = %omp.par.region6
br label %omp_loop.preheader14
omp.wsloop.region25: ; preds = %omp_loop.body17
store i64 0, i64* %3, align 8, !dbg !11
br label %omp.wsloop.exit24, !dbg !12
omp_loop.preheader14: ; preds = %omp.par.region13
store i64 0, i64* %p.lowerbound, align 8
store i64 0, i64* %p.upperbound, align 8
store i64 1, i64* %p.stride, align 8
%omp_global_thread_num26 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @5)
call void @__kmpc_for_static_init_8u(%struct.ident_t* @5, i32 %omp_global_thread_num26, i32 34, i32* %p.lastiter, i64* %p.lowerbound, i64* %p.upperbound, i64* %p.stride, i64 1, i64 0)
%4 = load i64, i64* %p.lowerbound, align 8
%5 = load i64, i64* %p.upperbound, align 8
%6 = sub i64 %5, %4
%7 = add i64 %6, 1
br label %omp_loop.header15
omp_loop.header15: ; preds = %omp_loop.inc18, %omp_loop.preheader14
%omp_loop.iv21 = phi i64 [ 0, %omp_loop.preheader14 ], [ %omp_loop.next23, %omp_loop.inc18 ]
br label %omp_loop.cond16
omp_loop.cond16: ; preds = %omp_loop.header15
%omp_loop.cmp22 = icmp ult i64 %omp_loop.iv21, %7
br i1 %omp_loop.cmp22, label %omp_loop.body17, label %omp_loop.exit19
omp_loop.body17: ; preds = %omp_loop.cond16
%8 = add i64 %omp_loop.iv21, %4, !dbg !13
%9 = mul i64 %8, 1
%10 = add i64 %9, 0
br label %omp.wsloop.region25
omp.wsloop.exit24: ; preds = %omp.wsloop.region25
br label %omp_loop.inc18
omp_loop.inc18: ; preds = %omp.wsloop.exit24
%omp_loop.next23 = add nuw i64 %omp_loop.iv21, 1
br label %omp_loop.header15
omp_loop.exit19: ; preds = %omp_loop.cond16
call void @__kmpc_for_static_fini(%struct.ident_t* @5, i32 %omp_global_thread_num26)
%omp_global_thread_num27 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @5), !dbg !13
call void @__kmpc_barrier(%struct.ident_t* @6, i32 %omp_global_thread_num27), !dbg !13
br label %omp_loop.after20
omp_loop.after20: ; preds = %omp_loop.exit19
br label %omp.par.pre_finalize7, !dbg !14
omp.par.entry5: ; preds = %omp.wsloop.region
%tid.addr.local9 = alloca i32, align 4
%tid10 = load i32, i32* %tid.addr.local9, align 4
%tid.addr.use11 = load i32, i32* %tid.addr3, align 4
%zero.addr.use12 = load i32, i32* %zero.addr4, align 4
%p.lastiter = alloca i32, align 4
%p.lowerbound = alloca i64, align 8
%p.upperbound = alloca i64, align 8
%p.stride = alloca i64, align 8
br label %omp.par.region6
omp.par.region6: ; preds = %omp.par.entry5
br label %omp.par.region13
omp.par.pre_finalize7: ; preds = %omp_loop.after20
br label %omp.par.outlined.exit
omp.par.outlined.exit: ; preds = %omp.par.pre_finalize7
br label %omp.par.exit8.split
omp.par.exit8.split: ; preds = %omp.par.outlined.exit
unreachable
omp_loop.preheader: ; preds = %omp.par.region1
br label %omp_loop.header
omp_loop.header: ; preds = %omp_loop.inc, %omp_loop.preheader
%omp_loop.iv = phi i64 [ 0, %omp_loop.preheader ], [ %omp_loop.next, %omp_loop.inc ]
br label %omp_loop.cond
omp_loop.cond: ; preds = %omp_loop.header
%omp_loop.cmp = icmp ult i64 %omp_loop.iv, 10
br i1 %omp_loop.cmp, label %omp_loop.body, label %omp_loop.exit
omp_loop.body: ; preds = %omp_loop.cond
%11 = mul i64 %omp_loop.iv, 1
%12 = add i64 %11, 0
br label %omp.wsloop.region
omp.wsloop.exit: ; No predecessors!
br label %omp_loop.inc
omp_loop.inc: ; preds = %omp.wsloop.exit
%omp_loop.next = add nuw i64 %omp_loop.iv, 1
br label %omp_loop.header
omp_loop.exit: ; preds = %omp_loop.cond
br label %omp_loop.after
omp_loop.after: ; preds = %omp_loop.exit
omp.par.entry: ; preds = %parallel.entry
%tid.addr.local = alloca i32, align 4
%tid = load i32, i32* %tid.addr.local, align 4
%tid.addr.use = load i32, i32* %tid.addr, align 4
%zero.addr.use = load i32, i32* %zero.addr, align 4
%tid.addr3 = alloca i32, align 4
%zero.addr4 = alloca i32, align 4
br label %omp.par.region
omp.par.region: ; preds = %omp.par.entry
br label %omp.par.region1
omp.par.pre_finalize: ; No predecessors!
br label %omp.par.exit
omp.par.exit: ; preds = %omp.par.pre_finalize
unreachable
}
Found OpenMP runtime function __kmpc_global_thread_num with type i32 (%struct.ident_t*)
Captured input: %tid.addr3 = alloca i32, align 4
Captured input: %zero.addr4 = alloca i32, align 4
Captured input: %3 = alloca i64, i64 1, align 8, !dbg !9
After privatization: define i32 @main(i32 %0, i8** %1) !dbg !3 {
%tid.addr = alloca i32, align 4, !dbg !7
%zero.addr = alloca i32, align 4, !dbg !7
br label %parallel.entry, !dbg !7
parallel.entry: ; preds = %2
%omp_global_thread_num = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1), !dbg !7
br label %omp.par.entry
omp.par.region1: ; preds = %omp.par.region
%3 = alloca i64, i64 1, align 8, !dbg !9
br label %omp_loop.preheader
omp.wsloop.region: ; preds = %omp_loop.body
%omp_global_thread_num2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @3), !dbg !10
br label %omp.par.entry5
omp.par.region13: ; preds = %omp.par.region6
br label %omp_loop.preheader14
omp.wsloop.region25: ; preds = %omp_loop.body17
store i64 0, i64* %3, align 8, !dbg !11
br label %omp.wsloop.exit24, !dbg !12
omp_loop.preheader14: ; preds = %omp.par.region13
store i64 0, i64* %p.lowerbound, align 8
store i64 0, i64* %p.upperbound, align 8
store i64 1, i64* %p.stride, align 8
%omp_global_thread_num26 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @5)
call void @__kmpc_for_static_init_8u(%struct.ident_t* @5, i32 %omp_global_thread_num26, i32 34, i32* %p.lastiter, i64* %p.lowerbound, i64* %p.upperbound, i64* %p.stride, i64 1, i64 0)
%4 = load i64, i64* %p.lowerbound, align 8
%5 = load i64, i64* %p.upperbound, align 8
%6 = sub i64 %5, %4
%7 = add i64 %6, 1
br label %omp_loop.header15
omp_loop.header15: ; preds = %omp_loop.inc18, %omp_loop.preheader14
%omp_loop.iv21 = phi i64 [ 0, %omp_loop.preheader14 ], [ %omp_loop.next23, %omp_loop.inc18 ]
br label %omp_loop.cond16
omp_loop.cond16: ; preds = %omp_loop.header15
%omp_loop.cmp22 = icmp ult i64 %omp_loop.iv21, %7
br i1 %omp_loop.cmp22, label %omp_loop.body17, label %omp_loop.exit19
omp_loop.body17: ; preds = %omp_loop.cond16
%8 = add i64 %omp_loop.iv21, %4, !dbg !13
%9 = mul i64 %8, 1
%10 = add i64 %9, 0
br label %omp.wsloop.region25
omp.wsloop.exit24: ; preds = %omp.wsloop.region25
br label %omp_loop.inc18
omp_loop.inc18: ; preds = %omp.wsloop.exit24
%omp_loop.next23 = add nuw i64 %omp_loop.iv21, 1
br label %omp_loop.header15
omp_loop.exit19: ; preds = %omp_loop.cond16
call void @__kmpc_for_static_fini(%struct.ident_t* @5, i32 %omp_global_thread_num26)
%omp_global_thread_num27 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @5), !dbg !13
call void @__kmpc_barrier(%struct.ident_t* @6, i32 %omp_global_thread_num27), !dbg !13
br label %omp_loop.after20
omp_loop.after20: ; preds = %omp_loop.exit19
br label %omp.par.pre_finalize7, !dbg !14
omp.par.entry5: ; preds = %omp.wsloop.region
%tid.addr.local9 = alloca i32, align 4
%tid10 = load i32, i32* %tid.addr.local9, align 4
%tid.addr.use11 = load i32, i32* %tid.addr3, align 4
%zero.addr.use12 = load i32, i32* %zero.addr4, align 4
%p.lastiter = alloca i32, align 4
%p.lowerbound = alloca i64, align 8
%p.upperbound = alloca i64, align 8
%p.stride = alloca i64, align 8
br label %omp.par.region6
omp.par.region6: ; preds = %omp.par.entry5
br label %omp.par.region13
omp.par.pre_finalize7: ; preds = %omp_loop.after20
br label %omp.par.outlined.exit
omp.par.outlined.exit: ; preds = %omp.par.pre_finalize7
br label %omp.par.exit8.split
omp.par.exit8.split: ; preds = %omp.par.outlined.exit
unreachable
omp_loop.preheader: ; preds = %omp.par.region1
br label %omp_loop.header
omp_loop.header: ; preds = %omp_loop.inc, %omp_loop.preheader
%omp_loop.iv = phi i64 [ 0, %omp_loop.preheader ], [ %omp_loop.next, %omp_loop.inc ]
br label %omp_loop.cond
omp_loop.cond: ; preds = %omp_loop.header
%omp_loop.cmp = icmp ult i64 %omp_loop.iv, 10
br i1 %omp_loop.cmp, label %omp_loop.body, label %omp_loop.exit
omp_loop.body: ; preds = %omp_loop.cond
%11 = mul i64 %omp_loop.iv, 1
%12 = add i64 %11, 0
br label %omp.wsloop.region
omp.wsloop.exit: ; No predecessors!
br label %omp_loop.inc
omp_loop.inc: ; preds = %omp.wsloop.exit
%omp_loop.next = add nuw i64 %omp_loop.iv, 1
br label %omp_loop.header
omp_loop.exit: ; preds = %omp_loop.cond
br label %omp_loop.after
omp_loop.after: ; preds = %omp_loop.exit
omp.par.entry: ; preds = %parallel.entry
%tid.addr.local = alloca i32, align 4
%tid = load i32, i32* %tid.addr.local, align 4
%tid.addr.use = load i32, i32* %tid.addr, align 4
%zero.addr.use = load i32, i32* %zero.addr, align 4
%tid.addr3 = alloca i32, align 4
%zero.addr4 = alloca i32, align 4
br label %omp.par.region
omp.par.region: ; preds = %omp.par.entry
br label %omp.par.region1
omp.par.pre_finalize: ; No predecessors!
br label %omp.par.exit
omp.par.exit: ; preds = %omp.par.pre_finalize
unreachable
}
PBR: omp.par.entry5
PBR: omp.par.region6
PBR: omp.par.region13
PBR: omp_loop.preheader14
PBR: omp_loop.header15
PBR: omp_loop.cond16
PBR: omp_loop.exit19
PBR: omp_loop.after20
PBR: omp.par.pre_finalize7
PBR: omp_loop.body17
PBR: omp.wsloop.region25
PBR: omp.wsloop.exit24
PBR: omp_loop.inc18
PBR: omp.par.outlined.exit
Found OpenMP runtime function __kmpc_for_static_init_8u with type void (%struct.ident_t*, i32, i32, i32*, i64*, i64*, i64*, i64, i64)
Found OpenMP runtime function __kmpc_for_static_fini with type void (%struct.ident_t*, i32)
Found OpenMP runtime function __kmpc_global_thread_num with type i32 (%struct.ident_t*)
Found OpenMP runtime function __kmpc_global_thread_num with type i32 (%struct.ident_t*)
Found OpenMP runtime function __kmpc_barrier with type void (%struct.ident_t*, i32)
After body codegen: define i32 @main(i32 %0, i8** %1) !dbg !3 {
%tid.addr = alloca i32, align 4, !dbg !7
%zero.addr = alloca i32, align 4, !dbg !7
br label %parallel.entry, !dbg !7
parallel.entry: ; preds = %2
%omp_global_thread_num = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1), !dbg !7
br label %omp.par.entry
omp.par.region1: ; preds = %omp.par.region
%3 = alloca i64, i64 1, align 8, !dbg !9
br label %omp_loop.preheader
omp.wsloop.region: ; preds = %omp_loop.body
%omp_global_thread_num2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @3), !dbg !10
br label %omp.par.entry5
omp.par.region13: ; preds = %omp.par.region6
br label %omp_loop.preheader14
omp.wsloop.region25: ; preds = %omp_loop.body17
store i64 0, i64* %3, align 8, !dbg !11
br label %omp.wsloop.exit24, !dbg !12
omp_loop.preheader14: ; preds = %omp.par.region13
store i64 0, i64* %p.lowerbound, align 8
store i64 0, i64* %p.upperbound, align 8
store i64 1, i64* %p.stride, align 8
%omp_global_thread_num26 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @5)
call void @__kmpc_for_static_init_8u(%struct.ident_t* @5, i32 %omp_global_thread_num26, i32 34, i32* %p.lastiter, i64* %p.lowerbound, i64* %p.upperbound, i64* %p.stride, i64 1, i64 0)
%4 = load i64, i64* %p.lowerbound, align 8
%5 = load i64, i64* %p.upperbound, align 8
%6 = sub i64 %5, %4
%7 = add i64 %6, 1
br label %omp_loop.header15
omp_loop.header15: ; preds = %omp_loop.inc18, %omp_loop.preheader14
%omp_loop.iv21 = phi i64 [ 0, %omp_loop.preheader14 ], [ %omp_loop.next23, %omp_loop.inc18 ]
br label %omp_loop.cond16
omp_loop.cond16: ; preds = %omp_loop.header15
%omp_loop.cmp22 = icmp ult i64 %omp_loop.iv21, %7
br i1 %omp_loop.cmp22, label %omp_loop.body17, label %omp_loop.exit19
omp_loop.body17: ; preds = %omp_loop.cond16
%8 = add i64 %omp_loop.iv21, %4, !dbg !13
%9 = mul i64 %8, 1
%10 = add i64 %9, 0
br label %omp.wsloop.region25
omp.wsloop.exit24: ; preds = %omp.wsloop.region25
br label %omp_loop.inc18
omp_loop.inc18: ; preds = %omp.wsloop.exit24
%omp_loop.next23 = add nuw i64 %omp_loop.iv21, 1
br label %omp_loop.header15
omp_loop.exit19: ; preds = %omp_loop.cond16
call void @__kmpc_for_static_fini(%struct.ident_t* @5, i32 %omp_global_thread_num26)
%omp_global_thread_num27 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @5), !dbg !13
call void @__kmpc_barrier(%struct.ident_t* @6, i32 %omp_global_thread_num27), !dbg !13
br label %omp_loop.after20
omp_loop.after20: ; preds = %omp_loop.exit19
br label %omp.par.pre_finalize7, !dbg !14
omp.par.entry5: ; preds = %omp.wsloop.region
%tid.addr.local9 = alloca i32, align 4
%tid10 = load i32, i32* %tid.addr.local9, align 4
%tid.addr.use11 = load i32, i32* %tid.addr3, align 4
%zero.addr.use12 = load i32, i32* %zero.addr4, align 4
%p.lastiter = alloca i32, align 4
%p.lowerbound = alloca i64, align 8
%p.upperbound = alloca i64, align 8
%p.stride = alloca i64, align 8
br label %omp.par.region6
omp.par.region6: ; preds = %omp.par.entry5
br label %omp.par.region13
omp.par.pre_finalize7: ; preds = %omp_loop.after20
br label %omp.par.outlined.exit
omp.par.outlined.exit: ; preds = %omp.par.pre_finalize7
br label %omp.par.exit8.split
omp.par.exit8.split: ; preds = %omp.par.outlined.exit
%11 = load i64, i64* %3, align 8, !dbg !15
%12 = trunc i64 %11 to i32, !dbg !16
%13 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @str0, i32 0, i32 0), i32 %12, i32 %12), !dbg !17
br label %omp.wsloop.exit, !dbg !18
omp_loop.preheader: ; preds = %omp.par.region1
store i64 0, i64* %p.lowerbound29, align 8
store i64 9, i64* %p.upperbound30, align 8
store i64 1, i64* %p.stride31, align 8
%omp_global_thread_num32 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @8)
call void @__kmpc_for_static_init_8u(%struct.ident_t* @8, i32 %omp_global_thread_num32, i32 34, i32* %p.lastiter28, i64* %p.lowerbound29, i64* %p.upperbound30, i64* %p.stride31, i64 1, i64 0)
%14 = load i64, i64* %p.lowerbound29, align 8
%15 = load i64, i64* %p.upperbound30, align 8
%16 = sub i64 %15, %14
%17 = add i64 %16, 1
br label %omp_loop.header
omp_loop.header: ; preds = %omp_loop.inc, %omp_loop.preheader
%omp_loop.iv = phi i64 [ 0, %omp_loop.preheader ], [ %omp_loop.next, %omp_loop.inc ]
br label %omp_loop.cond
omp_loop.cond: ; preds = %omp_loop.header
%omp_loop.cmp = icmp ult i64 %omp_loop.iv, %17
br i1 %omp_loop.cmp, label %omp_loop.body, label %omp_loop.exit
omp_loop.body: ; preds = %omp_loop.cond
%18 = add i64 %omp_loop.iv, %14, !dbg !19
%19 = mul i64 %18, 1
%20 = add i64 %19, 0
br label %omp.wsloop.region
omp.wsloop.exit: ; preds = %omp.par.exit8.split
br label %omp_loop.inc
omp_loop.inc: ; preds = %omp.wsloop.exit
%omp_loop.next = add nuw i64 %omp_loop.iv, 1
br label %omp_loop.header
omp_loop.exit: ; preds = %omp_loop.cond
call void @__kmpc_for_static_fini(%struct.ident_t* @8, i32 %omp_global_thread_num32)
%omp_global_thread_num33 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @8), !dbg !19
call void @__kmpc_barrier(%struct.ident_t* @9, i32 %omp_global_thread_num33), !dbg !19
br label %omp_loop.after
omp_loop.after: ; preds = %omp_loop.exit
br label %omp.par.pre_finalize, !dbg !20
omp.par.entry: ; preds = %parallel.entry
%tid.addr.local = alloca i32, align 4
%tid = load i32, i32* %tid.addr.local, align 4
%tid.addr.use = load i32, i32* %tid.addr, align 4
%zero.addr.use = load i32, i32* %zero.addr, align 4
%tid.addr3 = alloca i32, align 4
%zero.addr4 = alloca i32, align 4
%p.lastiter28 = alloca i32, align 4
%p.lowerbound29 = alloca i64, align 8
%p.upperbound30 = alloca i64, align 8
%p.stride31 = alloca i64, align 8
br label %omp.par.region
omp.par.region: ; preds = %omp.par.entry
br label %omp.par.region1
omp.par.pre_finalize: ; preds = %omp_loop.after
br label %omp.par.exit
omp.par.exit: ; preds = %omp.par.pre_finalize
unreachable
}
Found OpenMP runtime function __kmpc_fork_call with type void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...)
Before privatization: define i32 @main(i32 %0, i8** %1) !dbg !3 {
%tid.addr = alloca i32, align 4, !dbg !7
%zero.addr = alloca i32, align 4, !dbg !7
br label %parallel.entry, !dbg !7
parallel.entry: ; preds = %2
%omp_global_thread_num = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1), !dbg !7
br label %omp.par.entry
omp.par.region1: ; preds = %omp.par.region
%3 = alloca i64, i64 1, align 8, !dbg !9
br label %omp_loop.preheader
omp.wsloop.region: ; preds = %omp_loop.body
%omp_global_thread_num2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @3), !dbg !10
br label %omp.par.entry5
omp.par.region13: ; preds = %omp.par.region6
br label %omp_loop.preheader14
omp.wsloop.region25: ; preds = %omp_loop.body17
store i64 0, i64* %3, align 8, !dbg !11
br label %omp.wsloop.exit24, !dbg !12
omp_loop.preheader14: ; preds = %omp.par.region13
store i64 0, i64* %p.lowerbound, align 8
store i64 0, i64* %p.upperbound, align 8
store i64 1, i64* %p.stride, align 8
%omp_global_thread_num26 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @5)
call void @__kmpc_for_static_init_8u(%struct.ident_t* @5, i32 %omp_global_thread_num26, i32 34, i32* %p.lastiter, i64* %p.lowerbound, i64* %p.upperbound, i64* %p.stride, i64 1, i64 0)
%4 = load i64, i64* %p.lowerbound, align 8
%5 = load i64, i64* %p.upperbound, align 8
%6 = sub i64 %5, %4
%7 = add i64 %6, 1
br label %omp_loop.header15
omp_loop.header15: ; preds = %omp_loop.inc18, %omp_loop.preheader14
%omp_loop.iv21 = phi i64 [ 0, %omp_loop.preheader14 ], [ %omp_loop.next23, %omp_loop.inc18 ]
br label %omp_loop.cond16
omp_loop.cond16: ; preds = %omp_loop.header15
%omp_loop.cmp22 = icmp ult i64 %omp_loop.iv21, %7
br i1 %omp_loop.cmp22, label %omp_loop.body17, label %omp_loop.exit19
omp_loop.body17: ; preds = %omp_loop.cond16
%8 = add i64 %omp_loop.iv21, %4, !dbg !13
%9 = mul i64 %8, 1
%10 = add i64 %9, 0
br label %omp.wsloop.region25
omp.wsloop.exit24: ; preds = %omp.wsloop.region25
br label %omp_loop.inc18
omp_loop.inc18: ; preds = %omp.wsloop.exit24
%omp_loop.next23 = add nuw i64 %omp_loop.iv21, 1
br label %omp_loop.header15
omp_loop.exit19: ; preds = %omp_loop.cond16
call void @__kmpc_for_static_fini(%struct.ident_t* @5, i32 %omp_global_thread_num26)
%omp_global_thread_num27 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @5), !dbg !13
call void @__kmpc_barrier(%struct.ident_t* @6, i32 %omp_global_thread_num27), !dbg !13
br label %omp_loop.after20
omp_loop.after20: ; preds = %omp_loop.exit19
br label %omp.par.pre_finalize7, !dbg !14
omp.par.entry5: ; preds = %omp.wsloop.region
%tid.addr.local9 = alloca i32, align 4
%tid10 = load i32, i32* %tid.addr.local9, align 4
%tid.addr.use11 = load i32, i32* %tid.addr3, align 4
%zero.addr.use12 = load i32, i32* %zero.addr4, align 4
%p.lastiter = alloca i32, align 4
%p.lowerbound = alloca i64, align 8
%p.upperbound = alloca i64, align 8
%p.stride = alloca i64, align 8
br label %omp.par.region6
omp.par.region6: ; preds = %omp.par.entry5
br label %omp.par.region13
omp.par.pre_finalize7: ; preds = %omp_loop.after20
br label %omp.par.outlined.exit
omp.par.outlined.exit: ; preds = %omp.par.pre_finalize7
br label %omp.par.exit8.split
omp.par.exit8.split: ; preds = %omp.par.outlined.exit
%11 = load i64, i64* %3, align 8, !dbg !15
%12 = trunc i64 %11 to i32, !dbg !16
%13 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @str0, i32 0, i32 0), i32 %12, i32 %12), !dbg !17
br label %omp.wsloop.exit, !dbg !18
omp_loop.preheader: ; preds = %omp.par.region1
store i64 0, i64* %p.lowerbound29, align 8
store i64 9, i64* %p.upperbound30, align 8
store i64 1, i64* %p.stride31, align 8
%omp_global_thread_num32 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @8)
call void @__kmpc_for_static_init_8u(%struct.ident_t* @8, i32 %omp_global_thread_num32, i32 34, i32* %p.lastiter28, i64* %p.lowerbound29, i64* %p.upperbound30, i64* %p.stride31, i64 1, i64 0)
%14 = load i64, i64* %p.lowerbound29, align 8
%15 = load i64, i64* %p.upperbound30, align 8
%16 = sub i64 %15, %14
%17 = add i64 %16, 1
br label %omp_loop.header
omp_loop.header: ; preds = %omp_loop.inc, %omp_loop.preheader
%omp_loop.iv = phi i64 [ 0, %omp_loop.preheader ], [ %omp_loop.next, %omp_loop.inc ]
br label %omp_loop.cond
omp_loop.cond: ; preds = %omp_loop.header
%omp_loop.cmp = icmp ult i64 %omp_loop.iv, %17
br i1 %omp_loop.cmp, label %omp_loop.body, label %omp_loop.exit
omp_loop.body: ; preds = %omp_loop.cond
%18 = add i64 %omp_loop.iv, %14, !dbg !19
%19 = mul i64 %18, 1
%20 = add i64 %19, 0
br label %omp.wsloop.region
omp.wsloop.exit: ; preds = %omp.par.exit8.split
br label %omp_loop.inc
omp_loop.inc: ; preds = %omp.wsloop.exit
%omp_loop.next = add nuw i64 %omp_loop.iv, 1
br label %omp_loop.header
omp_loop.exit: ; preds = %omp_loop.cond
call void @__kmpc_for_static_fini(%struct.ident_t* @8, i32 %omp_global_thread_num32)
%omp_global_thread_num33 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @8), !dbg !19
call void @__kmpc_barrier(%struct.ident_t* @9, i32 %omp_global_thread_num33), !dbg !19
br label %omp_loop.after
omp_loop.after: ; preds = %omp_loop.exit
br label %omp.par.pre_finalize, !dbg !20
omp.par.entry: ; preds = %parallel.entry
%tid.addr.local = alloca i32, align 4
%tid = load i32, i32* %tid.addr.local, align 4
%tid.addr.use = load i32, i32* %tid.addr, align 4
%zero.addr.use = load i32, i32* %zero.addr, align 4
%tid.addr3 = alloca i32, align 4
%zero.addr4 = alloca i32, align 4
%p.lastiter28 = alloca i32, align 4
%p.lowerbound29 = alloca i64, align 8
%p.upperbound30 = alloca i64, align 8
%p.stride31 = alloca i64, align 8
br label %omp.par.region
omp.par.region: ; preds = %omp.par.entry
br label %omp.par.region1
omp.par.pre_finalize: ; preds = %omp_loop.after
br label %omp.par.outlined.exit34
omp.par.outlined.exit34: ; preds = %omp.par.pre_finalize
br label %omp.par.exit.split
omp.par.exit.split: ; preds = %omp.par.outlined.exit34
unreachable
}
Found OpenMP runtime function __kmpc_global_thread_num with type i32 (%struct.ident_t*)
Captured input: %tid.addr = alloca i32, align 4, !dbg !7
Captured input: %zero.addr = alloca i32, align 4, !dbg !7
After privatization: define i32 @main(i32 %0, i8** %1) !dbg !3 {
%tid.addr = alloca i32, align 4, !dbg !7
%zero.addr = alloca i32, align 4, !dbg !7
br label %parallel.entry, !dbg !7
parallel.entry: ; preds = %2
%omp_global_thread_num = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1), !dbg !7
br label %omp.par.entry
omp.par.region1: ; preds = %omp.par.region
%3 = alloca i64, i64 1, align 8, !dbg !9
br label %omp_loop.preheader
omp.wsloop.region: ; preds = %omp_loop.body
%omp_global_thread_num2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @3), !dbg !10
br label %omp.par.entry5
omp.par.region13: ; preds = %omp.par.region6
br label %omp_loop.preheader14
omp.wsloop.region25: ; preds = %omp_loop.body17
store i64 0, i64* %3, align 8, !dbg !11
br label %omp.wsloop.exit24, !dbg !12
omp_loop.preheader14: ; preds = %omp.par.region13
store i64 0, i64* %p.lowerbound, align 8
store i64 0, i64* %p.upperbound, align 8
store i64 1, i64* %p.stride, align 8
%omp_global_thread_num26 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @5)
call void @__kmpc_for_static_init_8u(%struct.ident_t* @5, i32 %omp_global_thread_num26, i32 34, i32* %p.lastiter, i64* %p.lowerbound, i64* %p.upperbound, i64* %p.stride, i64 1, i64 0)
%4 = load i64, i64* %p.lowerbound, align 8
%5 = load i64, i64* %p.upperbound, align 8
%6 = sub i64 %5, %4
%7 = add i64 %6, 1
br label %omp_loop.header15
omp_loop.header15: ; preds = %omp_loop.inc18, %omp_loop.preheader14
%omp_loop.iv21 = phi i64 [ 0, %omp_loop.preheader14 ], [ %omp_loop.next23, %omp_loop.inc18 ]
br label %omp_loop.cond16
omp_loop.cond16: ; preds = %omp_loop.header15
%omp_loop.cmp22 = icmp ult i64 %omp_loop.iv21, %7
br i1 %omp_loop.cmp22, label %omp_loop.body17, label %omp_loop.exit19
omp_loop.body17: ; preds = %omp_loop.cond16
%8 = add i64 %omp_loop.iv21, %4, !dbg !13
%9 = mul i64 %8, 1
%10 = add i64 %9, 0
br label %omp.wsloop.region25
omp.wsloop.exit24: ; preds = %omp.wsloop.region25
br label %omp_loop.inc18
omp_loop.inc18: ; preds = %omp.wsloop.exit24
%omp_loop.next23 = add nuw i64 %omp_loop.iv21, 1
br label %omp_loop.header15
omp_loop.exit19: ; preds = %omp_loop.cond16
call void @__kmpc_for_static_fini(%struct.ident_t* @5, i32 %omp_global_thread_num26)
%omp_global_thread_num27 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @5), !dbg !13
call void @__kmpc_barrier(%struct.ident_t* @6, i32 %omp_global_thread_num27), !dbg !13
br label %omp_loop.after20
omp_loop.after20: ; preds = %omp_loop.exit19
br label %omp.par.pre_finalize7, !dbg !14
omp.par.entry5: ; preds = %omp.wsloop.region
%tid.addr.local9 = alloca i32, align 4
%tid10 = load i32, i32* %tid.addr.local9, align 4
%tid.addr.use11 = load i32, i32* %tid.addr3, align 4
%zero.addr.use12 = load i32, i32* %zero.addr4, align 4
%p.lastiter = alloca i32, align 4
%p.lowerbound = alloca i64, align 8
%p.upperbound = alloca i64, align 8
%p.stride = alloca i64, align 8
br label %omp.par.region6
omp.par.region6: ; preds = %omp.par.entry5
br label %omp.par.region13
omp.par.pre_finalize7: ; preds = %omp_loop.after20
br label %omp.par.outlined.exit
omp.par.outlined.exit: ; preds = %omp.par.pre_finalize7
br label %omp.par.exit8.split
omp.par.exit8.split: ; preds = %omp.par.outlined.exit
%11 = load i64, i64* %3, align 8, !dbg !15
%12 = trunc i64 %11 to i32, !dbg !16
%13 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @str0, i32 0, i32 0), i32 %12, i32 %12), !dbg !17
br label %omp.wsloop.exit, !dbg !18
omp_loop.preheader: ; preds = %omp.par.region1
store i64 0, i64* %p.lowerbound29, align 8
store i64 9, i64* %p.upperbound30, align 8
store i64 1, i64* %p.stride31, align 8
%omp_global_thread_num32 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @8)
call void @__kmpc_for_static_init_8u(%struct.ident_t* @8, i32 %omp_global_thread_num32, i32 34, i32* %p.lastiter28, i64* %p.lowerbound29, i64* %p.upperbound30, i64* %p.stride31, i64 1, i64 0)
%14 = load i64, i64* %p.lowerbound29, align 8
%15 = load i64, i64* %p.upperbound30, align 8
%16 = sub i64 %15, %14
%17 = add i64 %16, 1
br label %omp_loop.header
omp_loop.header: ; preds = %omp_loop.inc, %omp_loop.preheader
%omp_loop.iv = phi i64 [ 0, %omp_loop.preheader ], [ %omp_loop.next, %omp_loop.inc ]
br label %omp_loop.cond
omp_loop.cond: ; preds = %omp_loop.header
%omp_loop.cmp = icmp ult i64 %omp_loop.iv, %17
br i1 %omp_loop.cmp, label %omp_loop.body, label %omp_loop.exit
omp_loop.body: ; preds = %omp_loop.cond
%18 = add i64 %omp_loop.iv, %14, !dbg !19
%19 = mul i64 %18, 1
%20 = add i64 %19, 0
br label %omp.wsloop.region
omp.wsloop.exit: ; preds = %omp.par.exit8.split
br label %omp_loop.inc
omp_loop.inc: ; preds = %omp.wsloop.exit
%omp_loop.next = add nuw i64 %omp_loop.iv, 1
br label %omp_loop.header
omp_loop.exit: ; preds = %omp_loop.cond
call void @__kmpc_for_static_fini(%struct.ident_t* @8, i32 %omp_global_thread_num32)
%omp_global_thread_num33 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @8), !dbg !19
call void @__kmpc_barrier(%struct.ident_t* @9, i32 %omp_global_thread_num33), !dbg !19
br label %omp_loop.after
omp_loop.after: ; preds = %omp_loop.exit
br label %omp.par.pre_finalize, !dbg !20
omp.par.entry: ; preds = %parallel.entry
%tid.addr.local = alloca i32, align 4
%tid = load i32, i32* %tid.addr.local, align 4
%tid.addr.use = load i32, i32* %tid.addr, align 4
%zero.addr.use = load i32, i32* %zero.addr, align 4
%tid.addr3 = alloca i32, align 4
%zero.addr4 = alloca i32, align 4
%p.lastiter28 = alloca i32, align 4
%p.lowerbound29 = alloca i64, align 8
%p.upperbound30 = alloca i64, align 8
%p.stride31 = alloca i64, align 8
br label %omp.par.region
omp.par.region: ; preds = %omp.par.entry
br label %omp.par.region1
omp.par.pre_finalize: ; preds = %omp_loop.after
br label %omp.par.outlined.exit34
omp.par.outlined.exit34: ; preds = %omp.par.pre_finalize
br label %omp.par.exit.split
omp.par.exit.split: ; preds = %omp.par.outlined.exit34
unreachable
}
PBR: omp.par.entry
PBR: omp.par.region
PBR: omp.par.region1
PBR: omp_loop.preheader
PBR: omp_loop.header
PBR: omp_loop.cond
PBR: omp_loop.exit
PBR: omp_loop.after
PBR: omp.par.pre_finalize
PBR: omp_loop.body
PBR: omp.wsloop.region
PBR: omp.par.entry5
PBR: omp.par.region6
PBR: omp.par.region13
PBR: omp_loop.preheader14
PBR: omp_loop.header15
PBR: omp_loop.cond16
PBR: omp_loop.exit19
PBR: omp_loop.after20
PBR: omp.par.pre_finalize7
PBR: omp.par.outlined.exit
PBR: omp.par.exit8.split
PBR: omp.wsloop.exit
PBR: omp_loop.inc
PBR: omp_loop.body17
PBR: omp.wsloop.region25
PBR: omp.wsloop.exit24
PBR: omp_loop.inc18
PBR: omp.par.outlined.exit34
Before outlining: define i32 @main(i32 %0, i8** %1) !dbg !3 {
%tid.addr = alloca i32, align 4, !dbg !7
%zero.addr = alloca i32, align 4, !dbg !7
br label %parallel.entry, !dbg !7
parallel.entry: ; preds = %2
%omp_global_thread_num = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1), !dbg !7
br label %omp.par.entry
omp.par.region1: ; preds = %omp.par.region
%3 = alloca i64, i64 1, align 8, !dbg !9
br label %omp_loop.preheader
omp.wsloop.region: ; preds = %omp_loop.body
%omp_global_thread_num2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @3), !dbg !10
br label %omp.par.entry5
omp.par.region13: ; preds = %omp.par.region6
br label %omp_loop.preheader14
omp.wsloop.region25: ; preds = %omp_loop.body17
store i64 0, i64* %3, align 8, !dbg !11
br label %omp.wsloop.exit24, !dbg !12
omp_loop.preheader14: ; preds = %omp.par.region13
store i64 0, i64* %p.lowerbound, align 8
store i64 0, i64* %p.upperbound, align 8
store i64 1, i64* %p.stride, align 8
%omp_global_thread_num26 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @5)
call void @__kmpc_for_static_init_8u(%struct.ident_t* @5, i32 %omp_global_thread_num26, i32 34, i32* %p.lastiter, i64* %p.lowerbound, i64* %p.upperbound, i64* %p.stride, i64 1, i64 0)
%4 = load i64, i64* %p.lowerbound, align 8
%5 = load i64, i64* %p.upperbound, align 8
%6 = sub i64 %5, %4
%7 = add i64 %6, 1
br label %omp_loop.header15
omp_loop.header15: ; preds = %omp_loop.inc18, %omp_loop.preheader14
%omp_loop.iv21 = phi i64 [ 0, %omp_loop.preheader14 ], [ %omp_loop.next23, %omp_loop.inc18 ]
br label %omp_loop.cond16
omp_loop.cond16: ; preds = %omp_loop.header15
%omp_loop.cmp22 = icmp ult i64 %omp_loop.iv21, %7
br i1 %omp_loop.cmp22, label %omp_loop.body17, label %omp_loop.exit19
omp_loop.body17: ; preds = %omp_loop.cond16
%8 = add i64 %omp_loop.iv21, %4, !dbg !13
%9 = mul i64 %8, 1
%10 = add i64 %9, 0
br label %omp.wsloop.region25
omp.wsloop.exit24: ; preds = %omp.wsloop.region25
br label %omp_loop.inc18
omp_loop.inc18: ; preds = %omp.wsloop.exit24
%omp_loop.next23 = add nuw i64 %omp_loop.iv21, 1
br label %omp_loop.header15
omp_loop.exit19: ; preds = %omp_loop.cond16
call void @__kmpc_for_static_fini(%struct.ident_t* @5, i32 %omp_global_thread_num26)
%omp_global_thread_num27 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @5), !dbg !13
call void @__kmpc_barrier(%struct.ident_t* @6, i32 %omp_global_thread_num27), !dbg !13
br label %omp_loop.after20
omp_loop.after20: ; preds = %omp_loop.exit19
br label %omp.par.pre_finalize7, !dbg !14
omp.par.entry5: ; preds = %omp.wsloop.region
%tid.addr.local9 = alloca i32, align 4
%tid10 = load i32, i32* %tid.addr.local9, align 4
%tid.addr.use11 = load i32, i32* %tid.addr3, align 4
%zero.addr.use12 = load i32, i32* %zero.addr4, align 4
%p.lastiter = alloca i32, align 4
%p.lowerbound = alloca i64, align 8
%p.upperbound = alloca i64, align 8
%p.stride = alloca i64, align 8
br label %omp.par.region6
omp.par.region6: ; preds = %omp.par.entry5
br label %omp.par.region13
omp.par.pre_finalize7: ; preds = %omp_loop.after20
br label %omp.par.outlined.exit
omp.par.outlined.exit: ; preds = %omp.par.pre_finalize7
br label %omp.par.exit8.split
omp.par.exit8.split: ; preds = %omp.par.outlined.exit
%11 = load i64, i64* %3, align 8, !dbg !15
%12 = trunc i64 %11 to i32, !dbg !16
%13 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @str0, i32 0, i32 0), i32 %12, i32 %12), !dbg !17
br label %omp.wsloop.exit, !dbg !18
omp_loop.preheader: ; preds = %omp.par.region1
store i64 0, i64* %p.lowerbound29, align 8
store i64 9, i64* %p.upperbound30, align 8
store i64 1, i64* %p.stride31, align 8
%omp_global_thread_num32 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @8)
call void @__kmpc_for_static_init_8u(%struct.ident_t* @8, i32 %omp_global_thread_num32, i32 34, i32* %p.lastiter28, i64* %p.lowerbound29, i64* %p.upperbound30, i64* %p.stride31, i64 1, i64 0)
%14 = load i64, i64* %p.lowerbound29, align 8
%15 = load i64, i64* %p.upperbound30, align 8
%16 = sub i64 %15, %14
%17 = add i64 %16, 1
br label %omp_loop.header
omp_loop.header: ; preds = %omp_loop.inc, %omp_loop.preheader
%omp_loop.iv = phi i64 [ 0, %omp_loop.preheader ], [ %omp_loop.next, %omp_loop.inc ]
br label %omp_loop.cond
omp_loop.cond: ; preds = %omp_loop.header
%omp_loop.cmp = icmp ult i64 %omp_loop.iv, %17
br i1 %omp_loop.cmp, label %omp_loop.body, label %omp_loop.exit
omp_loop.body: ; preds = %omp_loop.cond
%18 = add i64 %omp_loop.iv, %14, !dbg !19
%19 = mul i64 %18, 1
%20 = add i64 %19, 0
br label %omp.wsloop.region
omp.wsloop.exit: ; preds = %omp.par.exit8.split
br label %omp_loop.inc
omp_loop.inc: ; preds = %omp.wsloop.exit
%omp_loop.next = add nuw i64 %omp_loop.iv, 1
br label %omp_loop.header
omp_loop.exit: ; preds = %omp_loop.cond
call void @__kmpc_for_static_fini(%struct.ident_t* @8, i32 %omp_global_thread_num32)
%omp_global_thread_num33 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @8), !dbg !19
call void @__kmpc_barrier(%struct.ident_t* @9, i32 %omp_global_thread_num33), !dbg !19
br label %omp_loop.after
omp_loop.after: ; preds = %omp_loop.exit
br label %omp.par.pre_finalize, !dbg !20
omp.par.entry: ; preds = %parallel.entry
%tid.addr.local = alloca i32, align 4
%tid = load i32, i32* %tid.addr.local, align 4
%tid.addr.use = load i32, i32* %tid.addr, align 4
%zero.addr.use = load i32, i32* %zero.addr, align 4
%tid.addr3 = alloca i32, align 4
%zero.addr4 = alloca i32, align 4
%p.lastiter28 = alloca i32, align 4
%p.lowerbound29 = alloca i64, align 8
%p.upperbound30 = alloca i64, align 8
%p.stride31 = alloca i64, align 8
br label %omp.par.region
omp.par.region: ; preds = %omp.par.entry
br label %omp.par.region1
omp.par.pre_finalize: ; preds = %omp_loop.after
br label %omp.par.outlined.exit34
omp.par.outlined.exit34: ; preds = %omp.par.pre_finalize
br label %omp.par.exit.split
omp.par.exit.split: ; preds = %omp.par.outlined.exit34
ret i32 0, !dbg !21
}
Entry omp.par.entry5 Exit: omp.par.outlined.exit
After outlining: define i32 @main(i32 %0, i8** %1) !dbg !3 {
%structArg = alloca { i64* }, align 8
%tid.addr = alloca i32, align 4, !dbg !7
%zero.addr = alloca i32, align 4, !dbg !7
br label %parallel.entry, !dbg !7
parallel.entry: ; preds = %2
%omp_global_thread_num = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1), !dbg !7
br label %omp.par.entry
omp.par.region1: ; preds = %omp.par.region
%3 = alloca i64, i64 1, align 8, !dbg !9
br label %omp_loop.preheader
omp.wsloop.region: ; preds = %omp_loop.body
%omp_global_thread_num2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @3), !dbg !10
br label %codeRepl
codeRepl: ; preds = %omp.wsloop.region
%gep_ = getelementptr { i64* }, { i64* }* %structArg, i32 0, i32 0
store i64* %3, i64** %gep_, align 8
call void @main..omp_par(i32* %tid.addr3, i32* %zero.addr4, { i64* }* %structArg), !dbg !11
br label %omp.par.outlined.exit
omp.par.outlined.exit: ; preds = %codeRepl
br label %omp.par.exit8.split
omp.par.exit8.split: ; preds = %omp.par.outlined.exit
%4 = load i64, i64* %3, align 8, !dbg !12
%5 = trunc i64 %4 to i32, !dbg !13
%6 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @str0, i32 0, i32 0), i32 %5, i32 %5), !dbg !14
br label %omp.wsloop.exit, !dbg !15
omp_loop.preheader: ; preds = %omp.par.region1
store i64 0, i64* %p.lowerbound29, align 8
store i64 9, i64* %p.upperbound30, align 8
store i64 1, i64* %p.stride31, align 8
%omp_global_thread_num32 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @8)
call void @__kmpc_for_static_init_8u(%struct.ident_t* @8, i32 %omp_global_thread_num32, i32 34, i32* %p.lastiter28, i64* %p.lowerbound29, i64* %p.upperbound30, i64* %p.stride31, i64 1, i64 0)
%7 = load i64, i64* %p.lowerbound29, align 8
%8 = load i64, i64* %p.upperbound30, align 8
%9 = sub i64 %8, %7
%10 = add i64 %9, 1
br label %omp_loop.header
omp_loop.header: ; preds = %omp_loop.inc, %omp_loop.preheader
%omp_loop.iv = phi i64 [ 0, %omp_loop.preheader ], [ %omp_loop.next, %omp_loop.inc ]
br label %omp_loop.cond
omp_loop.cond: ; preds = %omp_loop.header
%omp_loop.cmp = icmp ult i64 %omp_loop.iv, %10
br i1 %omp_loop.cmp, label %omp_loop.body, label %omp_loop.exit
omp_loop.body: ; preds = %omp_loop.cond
%11 = add i64 %omp_loop.iv, %7, !dbg !16
%12 = mul i64 %11, 1
%13 = add i64 %12, 0
br label %omp.wsloop.region
omp.wsloop.exit: ; preds = %omp.par.exit8.split
br label %omp_loop.inc
omp_loop.inc: ; preds = %omp.wsloop.exit
%omp_loop.next = add nuw i64 %omp_loop.iv, 1
br label %omp_loop.header
omp_loop.exit: ; preds = %omp_loop.cond
call void @__kmpc_for_static_fini(%struct.ident_t* @8, i32 %omp_global_thread_num32)
%omp_global_thread_num33 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @8), !dbg !16
call void @__kmpc_barrier(%struct.ident_t* @9, i32 %omp_global_thread_num33), !dbg !16
br label %omp_loop.after
omp_loop.after: ; preds = %omp_loop.exit
br label %omp.par.pre_finalize, !dbg !17
omp.par.entry: ; preds = %parallel.entry
%tid.addr.local = alloca i32, align 4
%tid = load i32, i32* %tid.addr.local, align 4
%tid.addr.use = load i32, i32* %tid.addr, align 4
%zero.addr.use = load i32, i32* %zero.addr, align 4
%tid.addr3 = alloca i32, align 4
%zero.addr4 = alloca i32, align 4
%p.lastiter28 = alloca i32, align 4
%p.lowerbound29 = alloca i64, align 8
%p.upperbound30 = alloca i64, align 8
%p.stride31 = alloca i64, align 8
br label %omp.par.region
omp.par.region: ; preds = %omp.par.entry
br label %omp.par.region1
omp.par.pre_finalize: ; preds = %omp_loop.after
br label %omp.par.outlined.exit34
omp.par.outlined.exit34: ; preds = %omp.par.pre_finalize
br label %omp.par.exit.split
omp.par.exit.split: ; preds = %omp.par.outlined.exit34
ret i32 0, !dbg !18
}
Outlined function: define internal void @main..omp_par(i32* %tid.addr3, i32* %zero.addr4, { i64* }* %0) !dbg !21 {
newFuncRoot:
%gep_ = getelementptr { i64* }, { i64* }* %0, i32 0, i32 0
%loadgep_ = load i64*, i64** %gep_, align 8
br label %omp.par.entry5, !dbg !22
omp.par.entry5: ; preds = %newFuncRoot
%tid.addr.local9 = alloca i32, align 4
%tid10 = load i32, i32* %tid.addr.local9, align 4
%tid.addr.use11 = load i32, i32* %tid.addr3, align 4
%zero.addr.use12 = load i32, i32* %zero.addr4, align 4
%p.lastiter = alloca i32, align 4
%p.lowerbound = alloca i64, align 8
%p.upperbound = alloca i64, align 8
%p.stride = alloca i64, align 8
br label %omp.par.region6
omp.par.region6: ; preds = %omp.par.entry5
br label %omp.par.region13
omp.par.region13: ; preds = %omp.par.region6
br label %omp_loop.preheader14
omp_loop.preheader14: ; preds = %omp.par.region13
store i64 0, i64* %p.lowerbound, align 8
store i64 0, i64* %p.upperbound, align 8
store i64 1, i64* %p.stride, align 8
%omp_global_thread_num26 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @5)
call void @__kmpc_for_static_init_8u(%struct.ident_t* @5, i32 %omp_global_thread_num26, i32 34, i32* %p.lastiter, i64* %p.lowerbound, i64* %p.upperbound, i64* %p.stride, i64 1, i64 0)
%1 = load i64, i64* %p.lowerbound, align 8
%2 = load i64, i64* %p.upperbound, align 8
%3 = sub i64 %2, %1
%4 = add i64 %3, 1
br label %omp_loop.header15
omp_loop.header15: ; preds = %omp_loop.inc18, %omp_loop.preheader14
%omp_loop.iv21 = phi i64 [ 0, %omp_loop.preheader14 ], [ %omp_loop.next23, %omp_loop.inc18 ]
br label %omp_loop.cond16
omp_loop.cond16: ; preds = %omp_loop.header15
%omp_loop.cmp22 = icmp ult i64 %omp_loop.iv21, %4
br i1 %omp_loop.cmp22, label %omp_loop.body17, label %omp_loop.exit19
omp_loop.exit19: ; preds = %omp_loop.cond16
call void @__kmpc_for_static_fini(%struct.ident_t* @5, i32 %omp_global_thread_num26)
%omp_global_thread_num27 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @5), !dbg !22
call void @__kmpc_barrier(%struct.ident_t* @6, i32 %omp_global_thread_num27), !dbg !22
br label %omp_loop.after20
omp_loop.after20: ; preds = %omp_loop.exit19
br label %omp.par.pre_finalize7, !dbg !23
omp.par.pre_finalize7: ; preds = %omp_loop.after20
br label %omp.par.outlined.exit.exitStub
omp_loop.body17: ; preds = %omp_loop.cond16
%5 = add i64 %omp_loop.iv21, %1, !dbg !22
%6 = mul i64 %5, 1
%7 = add i64 %6, 0
br label %omp.wsloop.region25
omp.wsloop.region25: ; preds = %omp_loop.body17
store i64 0, i64* %loadgep_, align 8, !dbg !24
br label %omp.wsloop.exit24, !dbg !25
omp.wsloop.exit24: ; preds = %omp.wsloop.region25
br label %omp_loop.inc18
omp_loop.inc18: ; preds = %omp.wsloop.exit24
%omp_loop.next23 = add nuw i64 %omp_loop.iv21, 1
br label %omp_loop.header15
omp.par.outlined.exit.exitStub: ; preds = %omp.par.pre_finalize7
ret void
}
With fork_call placed: define i32 @main(i32 %0, i8** %1) !dbg !3 {
%structArg = alloca { i64* }, align 8
%tid.addr = alloca i32, align 4, !dbg !7
%zero.addr = alloca i32, align 4, !dbg !7
br label %parallel.entry, !dbg !7
parallel.entry: ; preds = %2
%omp_global_thread_num = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1), !dbg !7
br label %omp.par.entry
omp.par.region1: ; preds = %omp.par.region
%3 = alloca i64, i64 1, align 8, !dbg !9
br label %omp_loop.preheader
omp.wsloop.region: ; preds = %omp_loop.body
%omp_global_thread_num2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @3), !dbg !10
br label %omp_parallel
omp_parallel: ; preds = %omp.wsloop.region
%gep_ = getelementptr { i64* }, { i64* }* %structArg, i32 0, i32 0
store i64* %3, i64** %gep_, align 8
call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { i64* }*)* @main..omp_par to void (i32*, i32*, ...)*), { i64* }* %structArg), !dbg !11
call void @main..omp_par(i32* %tid.addr3, i32* %zero.addr4, { i64* }* %structArg), !dbg !11
br label %omp.par.outlined.exit
omp.par.outlined.exit: ; preds = %omp_parallel
br label %omp.par.exit8.split
omp.par.exit8.split: ; preds = %omp.par.outlined.exit
%4 = load i64, i64* %3, align 8, !dbg !12
%5 = trunc i64 %4 to i32, !dbg !13
%6 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @str0, i32 0, i32 0), i32 %5, i32 %5), !dbg !14
br label %omp.wsloop.exit, !dbg !15
omp_loop.preheader: ; preds = %omp.par.region1
store i64 0, i64* %p.lowerbound29, align 8
store i64 9, i64* %p.upperbound30, align 8
store i64 1, i64* %p.stride31, align 8
%omp_global_thread_num32 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @8)
call void @__kmpc_for_static_init_8u(%struct.ident_t* @8, i32 %omp_global_thread_num32, i32 34, i32* %p.lastiter28, i64* %p.lowerbound29, i64* %p.upperbound30, i64* %p.stride31, i64 1, i64 0)
%7 = load i64, i64* %p.lowerbound29, align 8
%8 = load i64, i64* %p.upperbound30, align 8
%9 = sub i64 %8, %7
%10 = add i64 %9, 1
br label %omp_loop.header
omp_loop.header: ; preds = %omp_loop.inc, %omp_loop.preheader
%omp_loop.iv = phi i64 [ 0, %omp_loop.preheader ], [ %omp_loop.next, %omp_loop.inc ]
br label %omp_loop.cond
omp_loop.cond: ; preds = %omp_loop.header
%omp_loop.cmp = icmp ult i64 %omp_loop.iv, %10
br i1 %omp_loop.cmp, label %omp_loop.body, label %omp_loop.exit
omp_loop.body: ; preds = %omp_loop.cond
%11 = add i64 %omp_loop.iv, %7, !dbg !16
%12 = mul i64 %11, 1
%13 = add i64 %12, 0
br label %omp.wsloop.region
omp.wsloop.exit: ; preds = %omp.par.exit8.split
br label %omp_loop.inc
omp_loop.inc: ; preds = %omp.wsloop.exit
%omp_loop.next = add nuw i64 %omp_loop.iv, 1
br label %omp_loop.header
omp_loop.exit: ; preds = %omp_loop.cond
call void @__kmpc_for_static_fini(%struct.ident_t* @8, i32 %omp_global_thread_num32)
%omp_global_thread_num33 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @8), !dbg !16
call void @__kmpc_barrier(%struct.ident_t* @9, i32 %omp_global_thread_num33), !dbg !16
br label %omp_loop.after
omp_loop.after: ; preds = %omp_loop.exit
br label %omp.par.pre_finalize, !dbg !17
omp.par.entry: ; preds = %parallel.entry
%tid.addr.local = alloca i32, align 4
%tid = load i32, i32* %tid.addr.local, align 4
%tid.addr.use = load i32, i32* %tid.addr, align 4
%zero.addr.use = load i32, i32* %zero.addr, align 4
%tid.addr3 = alloca i32, align 4
%zero.addr4 = alloca i32, align 4
%p.lastiter28 = alloca i32, align 4
%p.lowerbound29 = alloca i64, align 8
%p.upperbound30 = alloca i64, align 8
%p.stride31 = alloca i64, align 8
br label %omp.par.region
omp.par.region: ; preds = %omp.par.entry
br label %omp.par.region1
omp.par.pre_finalize: ; preds = %omp_loop.after
br label %omp.par.outlined.exit34
omp.par.outlined.exit34: ; preds = %omp.par.pre_finalize
br label %omp.par.exit.split
omp.par.exit.split: ; preds = %omp.par.outlined.exit34
ret i32 0, !dbg !18
}
Before outlining: define i32 @main(i32 %0, i8** %1) !dbg !3 {
%structArg = alloca { i64* }, align 8
%tid.addr = alloca i32, align 4, !dbg !7
%zero.addr = alloca i32, align 4, !dbg !7
br label %parallel.entry, !dbg !7
parallel.entry: ; preds = %2
%omp_global_thread_num = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1), !dbg !7
br label %omp.par.entry
omp.par.region1: ; preds = %omp.par.region
%3 = alloca i64, i64 1, align 8, !dbg !9
br label %omp_loop.preheader
omp.wsloop.region: ; preds = %omp_loop.body
%omp_global_thread_num2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @3), !dbg !10
br label %omp_parallel
omp_parallel: ; preds = %omp.wsloop.region
%gep_ = getelementptr { i64* }, { i64* }* %structArg, i32 0, i32 0
store i64* %3, i64** %gep_, align 8
call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { i64* }*)* @main..omp_par to void (i32*, i32*, ...)*), { i64* }* %structArg), !dbg !11
br label %omp.par.outlined.exit
omp.par.outlined.exit: ; preds = %omp_parallel
br label %omp.par.exit8.split
omp.par.exit8.split: ; preds = %omp.par.outlined.exit
%4 = load i64, i64* %3, align 8, !dbg !12
%5 = trunc i64 %4 to i32, !dbg !13
%6 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @str0, i32 0, i32 0), i32 %5, i32 %5), !dbg !14
br label %omp.wsloop.exit, !dbg !15
omp_loop.preheader: ; preds = %omp.par.region1
store i64 0, i64* %p.lowerbound29, align 8
store i64 9, i64* %p.upperbound30, align 8
store i64 1, i64* %p.stride31, align 8
%omp_global_thread_num32 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @8)
call void @__kmpc_for_static_init_8u(%struct.ident_t* @8, i32 %omp_global_thread_num32, i32 34, i32* %p.lastiter28, i64* %p.lowerbound29, i64* %p.upperbound30, i64* %p.stride31, i64 1, i64 0)
%7 = load i64, i64* %p.lowerbound29, align 8
%8 = load i64, i64* %p.upperbound30, align 8
%9 = sub i64 %8, %7
%10 = add i64 %9, 1
br label %omp_loop.header
omp_loop.header: ; preds = %omp_loop.inc, %omp_loop.preheader
%omp_loop.iv = phi i64 [ 0, %omp_loop.preheader ], [ %omp_loop.next, %omp_loop.inc ]
br label %omp_loop.cond
omp_loop.cond: ; preds = %omp_loop.header
%omp_loop.cmp = icmp ult i64 %omp_loop.iv, %10
br i1 %omp_loop.cmp, label %omp_loop.body, label %omp_loop.exit
omp_loop.body: ; preds = %omp_loop.cond
%11 = add i64 %omp_loop.iv, %7, !dbg !16
%12 = mul i64 %11, 1
%13 = add i64 %12, 0
br label %omp.wsloop.region
omp.wsloop.exit: ; preds = %omp.par.exit8.split
br label %omp_loop.inc
omp_loop.inc: ; preds = %omp.wsloop.exit
%omp_loop.next = add nuw i64 %omp_loop.iv, 1
br label %omp_loop.header
omp_loop.exit: ; preds = %omp_loop.cond
call void @__kmpc_for_static_fini(%struct.ident_t* @8, i32 %omp_global_thread_num32)
%omp_global_thread_num33 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @8), !dbg !16
call void @__kmpc_barrier(%struct.ident_t* @9, i32 %omp_global_thread_num33), !dbg !16
br label %omp_loop.after
omp_loop.after: ; preds = %omp_loop.exit
br label %omp.par.pre_finalize, !dbg !17
omp.par.entry: ; preds = %parallel.entry
%tid.addr.local = alloca i32, align 4
%tid = load i32, i32* %tid.addr.local, align 4
%tid.addr.use = load i32, i32* %tid.addr, align 4
%zero.addr.use = load i32, i32* %zero.addr, align 4
%p.lastiter28 = alloca i32, align 4
%p.lowerbound29 = alloca i64, align 8
%p.upperbound30 = alloca i64, align 8
%p.stride31 = alloca i64, align 8
br label %omp.par.region
omp.par.region: ; preds = %omp.par.entry
br label %omp.par.region1
omp.par.pre_finalize: ; preds = %omp_loop.after
br label %omp.par.outlined.exit34
omp.par.outlined.exit34: ; preds = %omp.par.pre_finalize
br label %omp.par.exit.split
omp.par.exit.split: ; preds = %omp.par.outlined.exit34
ret i32 0, !dbg !18
}
Entry omp.par.entry Exit: omp.par.outlined.exit34
After outlining: define i32 @main(i32 %0, i8** %1) !dbg !3 {
%structArg35 = alloca { { i64* }* }, align 8
%structArg = alloca { i64* }, align 8
%tid.addr = alloca i32, align 4, !dbg !7
%zero.addr = alloca i32, align 4, !dbg !7
br label %parallel.entry, !dbg !7
parallel.entry: ; preds = %2
%omp_global_thread_num = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1), !dbg !7
br label %codeRepl
codeRepl: ; preds = %parallel.entry
%gep_structArg = getelementptr { { i64* }* }, { { i64* }* }* %structArg35, i32 0, i32 0
store { i64* }* %structArg, { i64* }** %gep_structArg, align 8
call void @main..omp_par.1(i32* %tid.addr, i32* %zero.addr, { { i64* }* }* %structArg35), !dbg !9
br label %omp.par.outlined.exit34
omp.par.outlined.exit34: ; preds = %codeRepl
br label %omp.par.exit.split
omp.par.exit.split: ; preds = %omp.par.outlined.exit34
ret i32 0, !dbg !10
}
Outlined function: define internal void @main..omp_par.1(i32* %tid.addr, i32* %zero.addr, { { i64* }* }* %0) !dbg !18 {
newFuncRoot:
%gep_structArg = getelementptr { { i64* }* }, { { i64* }* }* %0, i32 0, i32 0
%loadgep_structArg = load { i64* }*, { i64* }** %gep_structArg, align 8
br label %omp.par.entry, !dbg !19
omp.par.entry: ; preds = %newFuncRoot
%tid.addr.local = alloca i32, align 4
%tid = load i32, i32* %tid.addr.local, align 4
%tid.addr.use = load i32, i32* %tid.addr, align 4
%zero.addr.use = load i32, i32* %zero.addr, align 4
%p.lastiter28 = alloca i32, align 4
%p.lowerbound29 = alloca i64, align 8
%p.upperbound30 = alloca i64, align 8
%p.stride31 = alloca i64, align 8
br label %omp.par.region
omp.par.region: ; preds = %omp.par.entry
br label %omp.par.region1
omp.par.region1: ; preds = %omp.par.region
%1 = alloca i64, i64 1, align 8, !dbg !19
br label %omp_loop.preheader
omp_loop.preheader: ; preds = %omp.par.region1
store i64 0, i64* %p.lowerbound29, align 8
store i64 9, i64* %p.upperbound30, align 8
store i64 1, i64* %p.stride31, align 8
%omp_global_thread_num32 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @8)
call void @__kmpc_for_static_init_8u(%struct.ident_t* @8, i32 %omp_global_thread_num32, i32 34, i32* %p.lastiter28, i64* %p.lowerbound29, i64* %p.upperbound30, i64* %p.stride31, i64 1, i64 0)
%2 = load i64, i64* %p.lowerbound29, align 8
%3 = load i64, i64* %p.upperbound30, align 8
%4 = sub i64 %3, %2
%5 = add i64 %4, 1
br label %omp_loop.header
omp_loop.header: ; preds = %omp_loop.inc, %omp_loop.preheader
%omp_loop.iv = phi i64 [ 0, %omp_loop.preheader ], [ %omp_loop.next, %omp_loop.inc ]
br label %omp_loop.cond
omp_loop.cond: ; preds = %omp_loop.header
%omp_loop.cmp = icmp ult i64 %omp_loop.iv, %5
br i1 %omp_loop.cmp, label %omp_loop.body, label %omp_loop.exit
omp_loop.exit: ; preds = %omp_loop.cond
call void @__kmpc_for_static_fini(%struct.ident_t* @8, i32 %omp_global_thread_num32)
%omp_global_thread_num33 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @8), !dbg !20
call void @__kmpc_barrier(%struct.ident_t* @9, i32 %omp_global_thread_num33), !dbg !20
br label %omp_loop.after
omp_loop.after: ; preds = %omp_loop.exit
br label %omp.par.pre_finalize, !dbg !21
omp.par.pre_finalize: ; preds = %omp_loop.after
br label %omp.par.outlined.exit34.exitStub
omp_loop.body: ; preds = %omp_loop.cond
%6 = add i64 %omp_loop.iv, %2, !dbg !20
%7 = mul i64 %6, 1
%8 = add i64 %7, 0
br label %omp.wsloop.region
omp.wsloop.region: ; preds = %omp_loop.body
%omp_global_thread_num2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @3), !dbg !22
br label %omp_parallel
omp_parallel: ; preds = %omp.wsloop.region
%gep_ = getelementptr { i64* }, { i64* }* %loadgep_structArg, i32 0, i32 0
store i64* %1, i64** %gep_, align 8
call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { i64* }*)* @main..omp_par to void (i32*, i32*, ...)*), { i64* }* %loadgep_structArg), !dbg !23
br label %omp.par.outlined.exit
omp.par.outlined.exit: ; preds = %omp_parallel
br label %omp.par.exit8.split
omp.par.exit8.split: ; preds = %omp.par.outlined.exit
%9 = load i64, i64* %1, align 8, !dbg !24
%10 = trunc i64 %9 to i32, !dbg !25
%11 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @str0, i32 0, i32 0), i32 %10, i32 %10), !dbg !26
br label %omp.wsloop.exit, !dbg !27
omp.wsloop.exit: ; preds = %omp.par.exit8.split
br label %omp_loop.inc
omp_loop.inc: ; preds = %omp.wsloop.exit
%omp_loop.next = add nuw i64 %omp_loop.iv, 1
br label %omp_loop.header
omp.par.outlined.exit34.exitStub: ; preds = %omp.par.pre_finalize
ret void
}
With fork_call placed: define i32 @main(i32 %0, i8** %1) !dbg !3 {
%structArg35 = alloca { { i64* }* }, align 8
%structArg = alloca { i64* }, align 8
%tid.addr = alloca i32, align 4, !dbg !7
%zero.addr = alloca i32, align 4, !dbg !7
br label %parallel.entry, !dbg !7
parallel.entry: ; preds = %2
%omp_global_thread_num = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1), !dbg !7
br label %omp_parallel
omp_parallel: ; preds = %parallel.entry
%gep_structArg = getelementptr { { i64* }* }, { { i64* }* }* %structArg35, i32 0, i32 0
store { i64* }* %structArg, { i64* }** %gep_structArg, align 8
call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { { i64* }* }*)* @main..omp_par.1 to void (i32*, i32*, ...)*), { { i64* }* }* %structArg35), !dbg !9
call void @main..omp_par.1(i32* %tid.addr, i32* %zero.addr, { { i64* }* }* %structArg35), !dbg !9
br label %omp.par.outlined.exit34
omp.par.outlined.exit34: ; preds = %omp_parallel
br label %omp.par.exit.split
omp.par.exit.split: ; preds = %omp.par.outlined.exit34
ret i32 0, !dbg !10
}
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
%struct.ident_t = type { i32, i32, i32, i32, i8* }
@str0 = internal constant [29 x i8] c"WG size of kernel = %d X %d\0A\00"
@0 = private unnamed_addr constant [30 x i8] c";LLVMDialectModule;main;6;5;;\00", align 1
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 29, i8* getelementptr inbounds ([30 x i8], [30 x i8]* @0, i32 0, i32 0) }, align 8
@2 = private unnamed_addr constant [31 x i8] c";LLVMDialectModule;main;13;9;;\00", align 1
@3 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 30, i8* getelementptr inbounds ([31 x i8], [31 x i8]* @2, i32 0, i32 0) }, align 8
@4 = private unnamed_addr constant [32 x i8] c";LLVMDialectModule;main;14;11;;\00", align 1
@5 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 31, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @4, i32 0, i32 0) }, align 8
@6 = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 31, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @4, i32 0, i32 0) }, align 8
@7 = private unnamed_addr constant [31 x i8] c";LLVMDialectModule;main;12;7;;\00", align 1
@8 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 30, i8* getelementptr inbounds ([31 x i8], [31 x i8]* @7, i32 0, i32 0) }, align 8
@9 = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 30, i8* getelementptr inbounds ([31 x i8], [31 x i8]* @7, i32 0, i32 0) }, align 8
declare i8* @malloc(i64)
declare void @free(i8*)
declare i32 @printf(i8*, ...)
define i32 @main(i32 %0, i8** %1) !dbg !3 {
; %structArg35: capture aggregate for the OUTER parallel region (holds one
; pointer: the address of %structArg).
; BUG (this issue): %structArg is the capture aggregate for the INNER parallel
; region, but CodeExtractor placed its alloca here in @main instead of inside
; the outlined outer region — so every outer-region thread shares it.
%structArg35 = alloca { { i64* }* }, align 8
%structArg = alloca { i64* }, align 8
br label %parallel.entry, !dbg !7
parallel.entry: ; preds = %2
%omp_global_thread_num = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1), !dbg !7
br label %omp_parallel
omp_parallel: ; preds = %parallel.entry
; Publish the inner capture struct's address through the outer capture struct,
; then fork the outer region with @main..omp_par.1 as the body.
%gep_structArg = getelementptr { { i64* }* }, { { i64* }* }* %structArg35, i32 0, i32 0
store { i64* }* %structArg, { i64* }** %gep_structArg, align 8
call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { { i64* }* }*)* @main..omp_par.1 to void (i32*, i32*, ...)*), { { i64* }* }* %structArg35), !dbg !9
br label %omp.par.outlined.exit34
omp.par.outlined.exit34: ; preds = %omp_parallel
br label %omp.par.exit.split
omp.par.exit.split: ; preds = %omp.par.outlined.exit34
ret i32 0, !dbg !10
}
; Function Attrs: norecurse nounwind
; Outlined body of the OUTER omp.parallel; run by every thread of the team
; created by the __kmpc_fork_call in @main. %0 is @main's %structArg35.
define internal void @main..omp_par.1(i32* noalias %tid.addr, i32* noalias %zero.addr, { { i64* }* }* %0) #0 !dbg !11 {
omp.par.entry:
; Unpack the capture aggregate. %loadgep_structArg is the address of the ONE
; { i64* } struct that lives on @main's stack — shared by all threads here.
%gep_structArg = getelementptr { { i64* }* }, { { i64* }* }* %0, i32 0, i32 0
%loadgep_structArg = load { i64* }*, { i64* }** %gep_structArg, align 8
%tid.addr.local = alloca i32, align 4
%1 = load i32, i32* %tid.addr, align 4
store i32 %1, i32* %tid.addr.local, align 4
%tid = load i32, i32* %tid.addr.local, align 4
; Worksharing-loop bookkeeping slots (per-thread).
%p.lastiter28 = alloca i32, align 4
%p.lowerbound29 = alloca i64, align 8
%p.upperbound30 = alloca i64, align 8
%p.stride31 = alloca i64, align 8
br label %omp.par.region
omp.par.region: ; preds = %omp.par.entry
br label %omp.par.region1
omp.par.region1: ; preds = %omp.par.region
; Thread-local i64 originating from the MLIR-level alloca; each thread gets
; its own copy of this slot.
%2 = alloca i64, i64 1, align 8, !dbg !12
br label %omp_loop.preheader
omp_loop.preheader: ; preds = %omp.par.region1
; static schedule init for iterations [0, 9], stride 1.
store i64 0, i64* %p.lowerbound29, align 8
store i64 9, i64* %p.upperbound30, align 8
store i64 1, i64* %p.stride31, align 8
%omp_global_thread_num32 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @8)
call void @__kmpc_for_static_init_8u(%struct.ident_t* @8, i32 %omp_global_thread_num32, i32 34, i32* %p.lastiter28, i64* %p.lowerbound29, i64* %p.upperbound30, i64* %p.stride31, i64 1, i64 0)
%3 = load i64, i64* %p.lowerbound29, align 8
%4 = load i64, i64* %p.upperbound30, align 8
%5 = sub i64 %4, %3
%6 = add i64 %5, 1
br label %omp_loop.header
omp_loop.header: ; preds = %omp_loop.inc, %omp_loop.preheader
%omp_loop.iv = phi i64 [ 0, %omp_loop.preheader ], [ %omp_loop.next, %omp_loop.inc ]
br label %omp_loop.cond
omp_loop.cond: ; preds = %omp_loop.header
%omp_loop.cmp = icmp ult i64 %omp_loop.iv, %6
br i1 %omp_loop.cmp, label %omp_loop.body, label %omp_loop.exit
omp_loop.exit: ; preds = %omp_loop.cond
call void @__kmpc_for_static_fini(%struct.ident_t* @8, i32 %omp_global_thread_num32)
%omp_global_thread_num33 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @8), !dbg !13
call void @__kmpc_barrier(%struct.ident_t* @9, i32 %omp_global_thread_num33), !dbg !13
br label %omp_loop.after
omp_loop.after: ; preds = %omp_loop.exit
br label %omp.par.pre_finalize, !dbg !14
omp.par.pre_finalize: ; preds = %omp_loop.after
br label %omp.par.outlined.exit34.exitStub
omp_loop.body: ; preds = %omp_loop.cond
%7 = add i64 %omp_loop.iv, %3, !dbg !13
%8 = mul i64 %7, 1
%9 = add i64 %8, 0
br label %omp.wsloop.region
omp.wsloop.region: ; preds = %omp_loop.body
%omp_global_thread_num2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @3), !dbg !15
br label %omp_parallel
omp_parallel: ; preds = %omp.wsloop.region
; RACE (this issue): every thread of the outer region stores the address of
; its OWN thread-local %2 into the SAME shared struct in @main's frame, then
; forks the inner region with that struct. Threads race on this store, so the
; inner region may write through another thread's pointer.
%gep_ = getelementptr { i64* }, { i64* }* %loadgep_structArg, i32 0, i32 0
store i64* %2, i64** %gep_, align 8
call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, { i64* }*)* @main..omp_par to void (i32*, i32*, ...)*), { i64* }* %loadgep_structArg), !dbg !16
br label %omp.par.outlined.exit
omp.par.outlined.exit: ; preds = %omp_parallel
br label %omp.par.exit8.split
omp.par.exit8.split: ; preds = %omp.par.outlined.exit
; Read back the value the inner region wrote through the captured pointer and
; print it; nondeterministic under the race described above.
%10 = load i64, i64* %2, align 8, !dbg !17
%11 = trunc i64 %10 to i32, !dbg !18
%12 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @str0, i32 0, i32 0), i32 %11, i32 %11), !dbg !19
br label %omp.wsloop.exit, !dbg !20
omp.wsloop.exit: ; preds = %omp.par.exit8.split
br label %omp_loop.inc
omp_loop.inc: ; preds = %omp.wsloop.exit
%omp_loop.next = add nuw i64 %omp_loop.iv, 1
br label %omp_loop.header
omp.par.outlined.exit34.exitStub: ; preds = %omp.par.pre_finalize
ret void
}
; Function Attrs: norecurse nounwind
; Outlined body of the INNER omp.parallel. %0 is the { i64* } capture struct
; forwarded by @main..omp_par.1 (shared across its threads per this issue).
define internal void @main..omp_par(i32* noalias %tid.addr3, i32* noalias %zero.addr4, { i64* }* %0) #0 !dbg !21 {
omp.par.entry5:
; %loadgep_ is the i64* captured from the enclosing region; under the race it
; may point at another outer-region thread's local slot.
%gep_ = getelementptr { i64* }, { i64* }* %0, i32 0, i32 0
%loadgep_ = load i64*, i64** %gep_, align 8
%tid.addr.local9 = alloca i32, align 4
%1 = load i32, i32* %tid.addr3, align 4
store i32 %1, i32* %tid.addr.local9, align 4
%tid10 = load i32, i32* %tid.addr.local9, align 4
; Worksharing-loop bookkeeping slots (per-thread).
%p.lastiter = alloca i32, align 4
%p.lowerbound = alloca i64, align 8
%p.upperbound = alloca i64, align 8
%p.stride = alloca i64, align 8
br label %omp.par.region6
omp.par.region6: ; preds = %omp.par.entry5
br label %omp.par.region13
omp.par.region13: ; preds = %omp.par.region6
br label %omp_loop.preheader14
omp_loop.preheader14: ; preds = %omp.par.region13
; static schedule init for the single iteration [0, 0], stride 1.
store i64 0, i64* %p.lowerbound, align 8
store i64 0, i64* %p.upperbound, align 8
store i64 1, i64* %p.stride, align 8
%omp_global_thread_num26 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @5)
call void @__kmpc_for_static_init_8u(%struct.ident_t* @5, i32 %omp_global_thread_num26, i32 34, i32* %p.lastiter, i64* %p.lowerbound, i64* %p.upperbound, i64* %p.stride, i64 1, i64 0)
%2 = load i64, i64* %p.lowerbound, align 8
%3 = load i64, i64* %p.upperbound, align 8
%4 = sub i64 %3, %2
%5 = add i64 %4, 1
br label %omp_loop.header15
omp_loop.header15: ; preds = %omp_loop.inc18, %omp_loop.preheader14
%omp_loop.iv21 = phi i64 [ 0, %omp_loop.preheader14 ], [ %omp_loop.next23, %omp_loop.inc18 ]
br label %omp_loop.cond16
omp_loop.cond16: ; preds = %omp_loop.header15
%omp_loop.cmp22 = icmp ult i64 %omp_loop.iv21, %5
br i1 %omp_loop.cmp22, label %omp_loop.body17, label %omp_loop.exit19
omp_loop.exit19: ; preds = %omp_loop.cond16
call void @__kmpc_for_static_fini(%struct.ident_t* @5, i32 %omp_global_thread_num26)
%omp_global_thread_num27 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @5), !dbg !22
call void @__kmpc_barrier(%struct.ident_t* @6, i32 %omp_global_thread_num27), !dbg !22
br label %omp_loop.after20
omp_loop.after20: ; preds = %omp_loop.exit19
br label %omp.par.pre_finalize7, !dbg !23
omp.par.pre_finalize7: ; preds = %omp_loop.after20
br label %omp.par.outlined.exit.exitStub
omp_loop.body17: ; preds = %omp_loop.cond16
%6 = add i64 %omp_loop.iv21, %2, !dbg !22
%7 = mul i64 %6, 1
%8 = add i64 %7, 0
br label %omp.wsloop.region25
omp.wsloop.region25: ; preds = %omp_loop.body17
; Write through the captured pointer; this is the store whose target is
; racy when the capture struct is shared across outer-region threads.
store i64 0, i64* %loadgep_, align 8, !dbg !24
br label %omp.wsloop.exit24, !dbg !25
omp.wsloop.exit24: ; preds = %omp.wsloop.region25
br label %omp_loop.inc18
omp_loop.inc18: ; preds = %omp.wsloop.exit24
%omp_loop.next23 = add nuw i64 %omp_loop.iv21, 1
br label %omp_loop.header15
omp.par.outlined.exit.exitStub: ; preds = %omp.par.pre_finalize7
ret void
}
; Function Attrs: nounwind
declare i32 @__kmpc_global_thread_num(%struct.ident_t*) #1
; Function Attrs: nounwind
declare void @__kmpc_for_static_init_8u(%struct.ident_t*, i32, i32, i32*, i64*, i64*, i64*, i64, i64) #1
; Function Attrs: nounwind
declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) #1
; Function Attrs: convergent nounwind
declare void @__kmpc_barrier(%struct.ident_t*, i32) #2
; Function Attrs: nounwind
declare !callback !26 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) #1
attributes #0 = { norecurse nounwind }
attributes #1 = { nounwind }
attributes #2 = { convergent nounwind }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!2}
!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "mlir", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!1 = !DIFile(filename: "LLVMDialectModule", directory: "/")
!2 = !{i32 2, !"Debug Info Version", i32 3}
!3 = distinct !DISubprogram(name: "main", linkageName: "main", scope: null, file: !4, line: 5, type: !5, scopeLine: 5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !6)
!4 = !DIFile(filename: "guan.mlir", directory: "/home/wmoses/git/Polygeist/proj/rodinia/cuda/hotspot")
!5 = !DISubroutineType(types: !6)
!6 = !{}
!7 = !DILocation(line: 6, column: 5, scope: !8)
!8 = !DILexicalBlockFile(scope: !3, file: !4, discriminator: 0)
!9 = !DILocation(line: 11, column: 13, scope: !8)
!10 = !DILocation(line: 30, column: 5, scope: !8)
!11 = distinct !DISubprogram(name: "main..omp_par.1", linkageName: "main..omp_par.1", scope: null, file: !4, type: !5, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !6)
!12 = !DILocation(line: 11, column: 13, scope: !11)
!13 = !DILocation(line: 12, column: 7, scope: !11)
!14 = !DILocation(line: 27, column: 7, scope: !11)
!15 = !DILocation(line: 13, column: 9, scope: !11)
!16 = !DILocation(line: 14, column: 11, scope: !11)
!17 = !DILocation(line: 20, column: 15, scope: !11)
!18 = !DILocation(line: 21, column: 15, scope: !11)
!19 = !DILocation(line: 24, column: 15, scope: !11)
!20 = !DILocation(line: 25, column: 9, scope: !11)
!21 = distinct !DISubprogram(name: "main..omp_par", linkageName: "main..omp_par", scope: null, file: !4, type: !5, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !6)
!22 = !DILocation(line: 14, column: 11, scope: !21)
!23 = !DILocation(line: 18, column: 11, scope: !21)
!24 = !DILocation(line: 15, column: 13, scope: !21)
!25 = !DILocation(line: 16, column: 13, scope: !21)
!26 = !{!27}
!27 = !{i64 2, i64 -1, i64 -1, i1 true} |
@llvm/issue-subscribers-openmp |
@jdoerfert it appears that the issue is that the aggregate capture alloca for the inner parallel is mistakenly being placed in the entry block of the enclosing function, thus causing it to be captured by the outer parallel region and leading to these issues. Really, what should happen is that the CodeExtractor should place the alloca at the correct allocation scope (https://llvm.org/doxygen/OMPIRBuilder_8cpp_source.html#l00296) |
… parallel The OpenMPIRBuilder has a bug. Specifically, suppose you have two nested openmp parallel regions (writing with MLIR for ease) ``` omp.parallel { %a = ... omp.parallel { use(%a) } } ``` As OpenMP only permits pointer-like inputs, the builder will wrap all of the inputs into a stack allocation, and then pass this allocation to the inner parallel. For example, we would want to get something like the following: ``` omp.parallel { %a = ... %tmp = alloc store %tmp[] = %a kmpc_fork(outlined, %tmp) } ``` However, in practice, this is not what currently occurs in the context of nested parallel regions. Specifically to the OpenMPIRBuilder, the entirety of the function (at the LLVM level) is currently inlined with blocks marking the corresponding start and end of each region. ``` entry: ... parallel1: %a = ... ... parallel2: use(%a) ... endparallel2: ... endparallel1: ... ``` When the allocation is inserted, it presently inserted into the parent of the entire function (e.g. entry) rather than the parent allocation scope to the function being outlined. If we were outlining parallel2, the corresponding alloca location would be parallel1. This causes a variety of bugs, including #54165 as one example. This PR allows the stack allocation to be created at the correct allocation block, and thus remedies such issues. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D121061
Fixed by https://reviews.llvm.org/D121061 |
… parallel The OpenMPIRBuilder has a bug. Specifically, suppose you have two nested openmp parallel regions (writing with MLIR for ease) ``` omp.parallel { %a = ... omp.parallel { use(%a) } } ``` As OpenMP only permits pointer-like inputs, the builder will wrap all of the inputs into a stack allocation, and then pass this allocation to the inner parallel. For example, we would want to get something like the following: ``` omp.parallel { %a = ... %tmp = alloc store %tmp[] = %a kmpc_fork(outlined, %tmp) } ``` However, in practice, this is not what currently occurs in the context of nested parallel regions. Specifically to the OpenMPIRBuilder, the entirety of the function (at the LLVM level) is currently inlined with blocks marking the corresponding start and end of each region. ``` entry: ... parallel1: %a = ... ... parallel2: use(%a) ... endparallel2: ... endparallel1: ... ``` When the allocation is inserted, it presently inserted into the parent of the entire function (e.g. entry) rather than the parent allocation scope to the function being outlined. If we were outlining parallel2, the corresponding alloca location would be parallel1. This causes a variety of bugs, including llvm/llvm-project#54165 as one example. This PR allows the stack allocation to be created at the correct allocation block, and thus remedies such issues. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D121061
Running the code results in incorrect, nondeterministic behavior.
cc @ftynse @jdoerfert
The text was updated successfully, but these errors were encountered: