diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index ac56df3823e20..133233ded42e5 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -12603,6 +12603,18 @@ struct AAAddressSpaceImpl : public AAAddressSpace {
     auto CheckAddressSpace = [&](Value &Obj) {
       if (isa<UndefValue>(&Obj))
        return true;
+      // Some targets relax the requirement for alloca to be in an exact address
+      // space, allowing it in certain other address spaces instead. These
+      // targets later lower alloca to the correct address space in the
+      // pipeline. Therefore, we need to query TTI to determine the appropriate
+      // address space.
+      if (auto *AI = dyn_cast<AllocaInst>(&Obj)) {
+        Function *Fn = AI->getFunction();
+        auto *TTI =
+            A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>(
+                *Fn);
+        return takeAddressSpace(TTI->getAssumedAddrSpace(AI));
+      }
       // If an argument in flat address space only has addrspace cast uses, and
       // those casts are same, then we take the dst addrspace.
       if (auto *Arg = dyn_cast<Argument>(&Obj)) {
diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines.ll b/llvm/test/Transforms/OpenMP/custom_state_machines.ll
index 10e521bbfcc10..b0b7c26f55575 100644
--- a/llvm/test/Transforms/OpenMP/custom_state_machines.ll
+++ b/llvm/test/Transforms/OpenMP/custom_state_machines.ll
@@ -906,11 +906,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
 ; AMDGPU-NEXT: entry:
 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment, ptr [[DYN]])
-; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment, ptr [[DYN]])
+; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 ; AMDGPU: user_code.entry:
-; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3:[0-9]+]]
+; AMDGPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3:[0-9]+]]
+; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
 ; AMDGPU-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
 ; AMDGPU-NEXT: call void @__kmpc_target_deinit()
 ; AMDGPU-NEXT: ret void
@@ -929,6 +931,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
 ; AMDGPU-NEXT: entry:
 ; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 ; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
 ; AMDGPU-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9:[0-9]+]]
 ; AMDGPU-NEXT: call void @unknown_no_openmp() #[[ATTR10:[0-9]+]]
 ; AMDGPU-NEXT: ret void
@@ -975,17 +979,18 @@ attributes #9 = { convergent nounwind readonly willreturn }
 ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 ; AMDGPU-NEXT: 
[[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment, ptr [[DYN]]) -; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment, ptr [[DYN]]) +; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP1]], -1 ; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; AMDGPU: is_worker_check: ; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] +; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP1]], [[BLOCK_SIZE]] ; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] ; AMDGPU: worker_state_machine.begin: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr ; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 @@ -999,12 +1004,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__2_wrapper.ID ; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] ; AMDGPU: worker_state_machine.parallel_region.execute: -; AMDGPU-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP1]]) ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] ; AMDGPU: worker_state_machine.parallel_region.check1: ; AMDGPU-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] ; AMDGPU: worker_state_machine.parallel_region.execute2: -; AMDGPU-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP1]]) ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] ; AMDGPU: worker_state_machine.parallel_region.check3: ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] @@ -1012,13 +1017,14 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; AMDGPU: worker_state_machine.done.barrier: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; AMDGPU: thread.user_code.check: -; 
AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU: user_code.entry: -; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__omp_outlined__1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: ret void @@ -1030,9 +1036,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__1 ; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @unknown_no_openmp() #[[ATTR10]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__2, ptr @__omp_outlined__2_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9]] @@ -1046,6 +1055,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @p0() #[[ATTR11:[0-9]+]] ; AMDGPU-NEXT: ret void ; @@ -1058,6 +1069,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__2(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1069,6 +1083,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @p1() #[[ATTR11]] ; AMDGPU-NEXT: ret void ; @@ -1081,6 +1097,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, 
align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1093,17 +1112,18 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr [[DYN]]) -; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr [[DYN]]) +; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP1]], -1 ; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; AMDGPU: is_worker_check: ; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] +; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP1]], [[BLOCK_SIZE]] ; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] ; AMDGPU: worker_state_machine.begin: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr ; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 @@ -1117,18 +1137,18 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__17_wrapper ; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] ; AMDGPU: worker_state_machine.parallel_region.execute: -; AMDGPU-NEXT: call void @__omp_outlined__17_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__omp_outlined__17_wrapper(i16 0, i32 [[TMP1]]) ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] ; AMDGPU: worker_state_machine.parallel_region.check1: ; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__5_wrapper.ID ; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label 
[[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] ; AMDGPU: worker_state_machine.parallel_region.execute2: -; AMDGPU-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP1]]) ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] ; AMDGPU: worker_state_machine.parallel_region.check3: ; AMDGPU-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE5:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK6:%.*]] ; AMDGPU: worker_state_machine.parallel_region.execute5: -; AMDGPU-NEXT: call void @__omp_outlined__18_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__omp_outlined__18_wrapper(i16 0, i32 [[TMP1]]) ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] ; AMDGPU: worker_state_machine.parallel_region.check6: ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] @@ -1136,13 +1156,14 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; AMDGPU: worker_state_machine.done.barrier: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; AMDGPU: thread.user_code.check: -; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU: user_code.entry: -; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: ret void @@ -1154,8 +1175,11 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__4 ; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @unknown_no_openmp() #[[ATTR10]] ; AMDGPU-NEXT: call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR9]] ; AMDGPU-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9]] @@ -1190,6 +1214,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @p1() #[[ATTR11]] ; 
AMDGPU-NEXT: ret void ; @@ -1202,6 +1228,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1234,17 +1263,18 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment, ptr [[DYN]]) -; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment, ptr [[DYN]]) +; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP1]], -1 ; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; AMDGPU: is_worker_check: ; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] +; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP1]], [[BLOCK_SIZE]] ; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] ; AMDGPU: worker_state_machine.begin: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr ; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 @@ -1258,28 +1288,29 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__7_wrapper.ID ; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] ; AMDGPU: worker_state_machine.parallel_region.execute: -; AMDGPU-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP1]]) ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] ; AMDGPU: worker_state_machine.parallel_region.check1: ; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq ptr 
[[WORKER_WORK_FN]], @__omp_outlined__8_wrapper.ID ; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] ; AMDGPU: worker_state_machine.parallel_region.execute2: -; AMDGPU-NEXT: call void @__omp_outlined__8_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__omp_outlined__8_wrapper(i16 0, i32 [[TMP1]]) ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] ; AMDGPU: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP1]]) ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] ; AMDGPU: worker_state_machine.parallel_region.end: ; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; AMDGPU: worker_state_machine.done.barrier: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; AMDGPU: thread.user_code.check: -; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU: user_code.entry: -; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: ret void @@ -1291,9 +1322,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__6 ; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-NEXT: [[CALL:%.*]] = call i32 @unknown() #[[ATTR11]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__8, ptr @__omp_outlined__8_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS1]], i64 0) @@ -1306,6 +1340,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: 
call void @p0() #[[ATTR11]] ; AMDGPU-NEXT: ret void ; @@ -1318,6 +1354,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1329,6 +1368,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @p1() #[[ATTR11]] ; AMDGPU-NEXT: ret void ; @@ -1341,6 +1382,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__8(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1353,17 +1397,18 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment, ptr [[DYN]]) -; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment, ptr [[DYN]]) +; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP1]], -1 ; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; AMDGPU: is_worker_check: ; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] +; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP1]], [[BLOCK_SIZE]] ; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] ; AMDGPU: worker_state_machine.begin: -; AMDGPU-NEXT: call void 
@__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr ; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 @@ -1377,12 +1422,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__10_wrapper.ID ; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] ; AMDGPU: worker_state_machine.parallel_region.execute: -; AMDGPU-NEXT: call void @__omp_outlined__10_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__omp_outlined__10_wrapper(i16 0, i32 [[TMP1]]) ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] ; AMDGPU: worker_state_machine.parallel_region.check1: ; AMDGPU-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] ; AMDGPU: worker_state_machine.parallel_region.execute2: -; AMDGPU-NEXT: call void @__omp_outlined__11_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__omp_outlined__11_wrapper(i16 0, i32 [[TMP1]]) ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] ; AMDGPU: worker_state_machine.parallel_region.check3: ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] @@ -1390,13 +1435,14 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; AMDGPU: worker_state_machine.done.barrier: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; AMDGPU: thread.user_code.check: -; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU: user_code.entry: -; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__omp_outlined__9(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: ret void @@ -1408,9 +1454,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__9 ; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr 
addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__10, ptr @__omp_outlined__10_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-NEXT: call void @unknown_no_openmp() #[[ATTR10]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__11, ptr @__omp_outlined__11_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS1]], i64 0) @@ -1423,6 +1472,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @p0() #[[ATTR11]] ; AMDGPU-NEXT: ret void ; @@ -1435,6 +1486,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__10(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1446,6 +1500,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @p1() #[[ATTR11]] ; AMDGPU-NEXT: ret void ; @@ -1458,6 +1514,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__11(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1470,17 +1529,18 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment, ptr [[DYN]]) -; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 
@__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment, ptr [[DYN]]) +; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP1]], -1 ; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; AMDGPU: is_worker_check: ; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] +; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP1]], [[BLOCK_SIZE]] ; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] ; AMDGPU: worker_state_machine.begin: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr ; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 @@ -1494,12 +1554,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__13_wrapper.ID ; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] ; AMDGPU: worker_state_machine.parallel_region.execute: -; AMDGPU-NEXT: call void @__omp_outlined__13_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__omp_outlined__13_wrapper(i16 0, i32 [[TMP1]]) ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] ; AMDGPU: worker_state_machine.parallel_region.check1: ; AMDGPU-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] ; AMDGPU: worker_state_machine.parallel_region.execute2: -; AMDGPU-NEXT: call void @__omp_outlined__14_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__omp_outlined__14_wrapper(i16 0, i32 [[TMP1]]) ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] ; AMDGPU: worker_state_machine.parallel_region.check3: ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] @@ -1507,13 +1567,14 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; AMDGPU: worker_state_machine.done.barrier: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; AMDGPU: thread.user_code.check: -; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU: user_code.entry: -; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-NEXT: [[TMP2:%.*]] = 
call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__omp_outlined__12(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: ret void @@ -1525,9 +1586,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__12 ; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @unknown_no_openmp() #[[ATTR10]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__13, ptr @__omp_outlined__13_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__14, ptr @__omp_outlined__14_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS1]], i64 0) @@ -1540,6 +1604,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @p0() #[[ATTR11]] ; AMDGPU-NEXT: ret void ; @@ -1552,6 +1618,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__13(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1563,6 +1632,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @p1() #[[ATTR11]] ; AMDGPU-NEXT: ret void ; @@ -1575,6 +1646,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr 
[[DOTADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__14(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1586,11 +1660,13 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment, ptr [[DYN]]) -; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment, ptr [[DYN]]) +; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU: user_code.entry: -; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__omp_outlined__15(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: ret void @@ -1604,6 +1680,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: [[CALL:%.*]] = call i32 @omp_get_thread_num() #[[ATTR9]] ; AMDGPU-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[CALL]]) #[[ATTR9]] ; AMDGPU-NEXT: ret void @@ -1614,15 +1692,18 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-SAME: (i32 [[A:%.*]]) #[[ATTR6]] { ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 -; AMDGPU-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: store i32 [[A]], ptr addrspace(5) [[TMP0]], align 4 +; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4 +; AMDGPU-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], 0 ; AMDGPU-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; AMDGPU: if.then: ; AMDGPU-NEXT: br label [[RETURN:%.*]] ; AMDGPU: if.end: -; AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4 -; AMDGPU-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 1 +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4 +; AMDGPU-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 1 
; AMDGPU-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[SUB]]) #[[ATTR9]] ; AMDGPU-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after.internalized() #[[ATTR9]] ; AMDGPU-NEXT: br label [[RETURN]] @@ -1658,17 +1739,18 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment, ptr [[DYN]]) -; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment, ptr [[DYN]]) +; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP1]], -1 ; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; AMDGPU: is_worker_check: ; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] +; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP1]], [[BLOCK_SIZE]] ; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] ; AMDGPU: worker_state_machine.begin: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr ; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 @@ -1679,19 +1761,20 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU: worker_state_machine.is_active.check: ; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] ; AMDGPU: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP1]]) ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] ; AMDGPU: worker_state_machine.parallel_region.end: ; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; AMDGPU: worker_state_machine.done.barrier: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; AMDGPU: thread.user_code.check: -; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label 
[[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU: user_code.entry: -; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__omp_outlined__16(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: ret void @@ -1705,6 +1788,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @weak_callee_empty() #[[ATTR9]] ; AMDGPU-NEXT: ret void ; @@ -1722,6 +1807,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @p0() #[[ATTR11]] ; AMDGPU-NEXT: ret void ; @@ -1734,6 +1821,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__17(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1745,6 +1835,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @p0() #[[ATTR11]] ; AMDGPU-NEXT: ret void ; @@ -1757,6 +1849,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__18(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1788,6 +1883,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca 
ptr, align 8 ; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @p0() #[[ATTR11]] ; AMDGPU-NEXT: ret void ; @@ -1800,6 +1897,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__19(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-NEXT: ret void @@ -1811,11 +1911,13 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment, ptr [[DYN]]) -; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment, ptr [[DYN]]) +; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX: user_code.entry: -; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3:[0-9]+]] +; NVPTX-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3:[0-9]+]] +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: ret void @@ -1834,6 +1936,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9:[0-9]+]] ; NVPTX-NEXT: call void @unknown_no_openmp() #[[ATTR10:[0-9]+]] ; NVPTX-NEXT: ret void @@ -1880,17 +1984,18 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment, ptr [[DYN]]) -; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 
@__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment, ptr [[DYN]]) +; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP1]], -1 ; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; NVPTX: is_worker_check: ; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] +; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP1]], [[BLOCK_SIZE]] ; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] ; NVPTX: worker_state_machine.begin: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) ; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null @@ -1903,12 +2008,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__2_wrapper.ID ; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] ; NVPTX: worker_state_machine.parallel_region.execute: -; NVPTX-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP1]]) ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] ; NVPTX: worker_state_machine.parallel_region.check1: ; NVPTX-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] ; NVPTX: worker_state_machine.parallel_region.execute2: -; NVPTX-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP1]]) ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] ; NVPTX: worker_state_machine.parallel_region.check3: ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] @@ -1916,13 +2021,14 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; NVPTX: worker_state_machine.done.barrier: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; NVPTX: thread.user_code.check: -; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX: user_code.entry: -; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr 
[[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__omp_outlined__1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: ret void @@ -1934,9 +2040,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__1 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @unknown_no_openmp() #[[ATTR10]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__2, ptr @__omp_outlined__2_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9]] @@ -1950,6 +2059,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @p0() #[[ATTR11:[0-9]+]] ; NVPTX-NEXT: ret void ; @@ -1962,6 +2073,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__2(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -1973,6 +2087,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @p1() #[[ATTR11]] ; NVPTX-NEXT: ret void ; @@ -1985,6 +2101,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) 
#[[ATTR3]] ; NVPTX-NEXT: ret void @@ -1997,17 +2116,18 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr [[DYN]]) -; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr [[DYN]]) +; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP1]], -1 ; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; NVPTX: is_worker_check: ; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] +; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP1]], [[BLOCK_SIZE]] ; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] ; NVPTX: worker_state_machine.begin: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) ; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null @@ -2020,18 +2140,18 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__17_wrapper ; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] ; NVPTX: worker_state_machine.parallel_region.execute: -; NVPTX-NEXT: call void @__omp_outlined__17_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: call void @__omp_outlined__17_wrapper(i16 0, i32 [[TMP1]]) ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] ; NVPTX: worker_state_machine.parallel_region.check1: ; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__5_wrapper.ID ; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] ; NVPTX: worker_state_machine.parallel_region.execute2: -; NVPTX-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP1]]) ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] ; NVPTX: worker_state_machine.parallel_region.check3: ; NVPTX-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE5:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK6:%.*]] ; NVPTX: worker_state_machine.parallel_region.execute5: -; NVPTX-NEXT: call void 
@__omp_outlined__18_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: call void @__omp_outlined__18_wrapper(i16 0, i32 [[TMP1]]) ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] ; NVPTX: worker_state_machine.parallel_region.check6: ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] @@ -2039,13 +2159,14 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; NVPTX: worker_state_machine.done.barrier: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; NVPTX: thread.user_code.check: -; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX: user_code.entry: -; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: ret void @@ -2057,8 +2178,11 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__4 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @unknown_no_openmp() #[[ATTR10]] ; NVPTX-NEXT: call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR9]] ; NVPTX-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9]] @@ -2093,6 +2217,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @p1() #[[ATTR11]] ; NVPTX-NEXT: ret void ; @@ -2105,6 +2231,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ 
-2137,17 +2266,18 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment, ptr [[DYN]]) -; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment, ptr [[DYN]]) +; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP1]], -1 ; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; NVPTX: is_worker_check: ; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] +; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP1]], [[BLOCK_SIZE]] ; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] ; NVPTX: worker_state_machine.begin: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) ; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null @@ -2160,28 +2290,29 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__7_wrapper.ID ; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] ; NVPTX: worker_state_machine.parallel_region.execute: -; NVPTX-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP1]]) ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] ; NVPTX: worker_state_machine.parallel_region.check1: ; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__8_wrapper.ID ; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] ; NVPTX: worker_state_machine.parallel_region.execute2: -; NVPTX-NEXT: call void @__omp_outlined__8_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: call void @__omp_outlined__8_wrapper(i16 0, i32 [[TMP1]]) ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] ; NVPTX: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP1]]) ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] ; NVPTX: worker_state_machine.parallel_region.end: ; 
NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; NVPTX: worker_state_machine.done.barrier: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; NVPTX: thread.user_code.check: -; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX: user_code.entry: -; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: ret void @@ -2193,9 +2324,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__6 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-NEXT: [[CALL:%.*]] = call i32 @unknown() #[[ATTR11]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__8, ptr @__omp_outlined__8_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS1]], i64 0) @@ -2208,6 +2342,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @p0() #[[ATTR11]] ; NVPTX-NEXT: ret void ; @@ -2220,6 +2356,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2231,6 +2370,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: entry: ; NVPTX-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @p1() #[[ATTR11]] ; NVPTX-NEXT: ret void ; @@ -2243,6 +2384,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__8(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2255,17 +2399,18 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment, ptr [[DYN]]) -; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment, ptr [[DYN]]) +; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP1]], -1 ; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; NVPTX: is_worker_check: ; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] +; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP1]], [[BLOCK_SIZE]] ; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] ; NVPTX: worker_state_machine.begin: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) ; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null @@ -2278,12 +2423,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__10_wrapper.ID ; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] ; NVPTX: worker_state_machine.parallel_region.execute: -; NVPTX-NEXT: call void @__omp_outlined__10_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: call void 
@__omp_outlined__10_wrapper(i16 0, i32 [[TMP1]]) ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] ; NVPTX: worker_state_machine.parallel_region.check1: ; NVPTX-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] ; NVPTX: worker_state_machine.parallel_region.execute2: -; NVPTX-NEXT: call void @__omp_outlined__11_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: call void @__omp_outlined__11_wrapper(i16 0, i32 [[TMP1]]) ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] ; NVPTX: worker_state_machine.parallel_region.check3: ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] @@ -2291,13 +2436,14 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; NVPTX: worker_state_machine.done.barrier: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; NVPTX: thread.user_code.check: -; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX: user_code.entry: -; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__omp_outlined__9(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: ret void @@ -2309,9 +2455,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__9 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__10, ptr @__omp_outlined__10_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-NEXT: call void @unknown_no_openmp() #[[ATTR10]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__11, ptr @__omp_outlined__11_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS1]], i64 0) @@ -2324,6 +2473,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @p0() #[[ATTR11]] ; NVPTX-NEXT: ret void ; @@ -2336,6 
+2487,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__10(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2347,6 +2501,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @p1() #[[ATTR11]] ; NVPTX-NEXT: ret void ; @@ -2359,6 +2515,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__11(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2371,17 +2530,18 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment, ptr [[DYN]]) -; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment, ptr [[DYN]]) +; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP1]], -1 ; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; NVPTX: is_worker_check: ; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() ; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] +; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP1]], [[BLOCK_SIZE]] ; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] ; NVPTX: worker_state_machine.begin: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; NVPTX-NEXT: 
[[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) ; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null @@ -2394,12 +2554,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__13_wrapper.ID ; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] ; NVPTX: worker_state_machine.parallel_region.execute: -; NVPTX-NEXT: call void @__omp_outlined__13_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: call void @__omp_outlined__13_wrapper(i16 0, i32 [[TMP1]]) ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] ; NVPTX: worker_state_machine.parallel_region.check1: ; NVPTX-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] ; NVPTX: worker_state_machine.parallel_region.execute2: -; NVPTX-NEXT: call void @__omp_outlined__14_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: call void @__omp_outlined__14_wrapper(i16 0, i32 [[TMP1]]) ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] ; NVPTX: worker_state_machine.parallel_region.check3: ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] @@ -2407,13 +2567,14 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; NVPTX: worker_state_machine.done.barrier: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; NVPTX: thread.user_code.check: -; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX: user_code.entry: -; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__omp_outlined__12(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: ret void @@ -2425,9 +2586,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__12 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @unknown_no_openmp() #[[ATTR10]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr 
@__omp_outlined__13, ptr @__omp_outlined__13_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__14, ptr @__omp_outlined__14_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS1]], i64 0) @@ -2440,6 +2604,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @p0() #[[ATTR11]] ; NVPTX-NEXT: ret void ; @@ -2452,6 +2618,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__13(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2463,6 +2632,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @p1() #[[ATTR11]] ; NVPTX-NEXT: ret void ; @@ -2475,6 +2646,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__14(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2486,11 +2660,13 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment, ptr [[DYN]]) -; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment, ptr [[DYN]]) +; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX: user_code.entry: -; 
NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__omp_outlined__15(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: ret void @@ -2504,6 +2680,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: [[CALL:%.*]] = call i32 @omp_get_thread_num() #[[ATTR9]] ; NVPTX-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[CALL]]) #[[ATTR9]] ; NVPTX-NEXT: ret void @@ -2514,15 +2692,18 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-SAME: (i32 [[A:%.*]]) #[[ATTR6]] { ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 -; NVPTX-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: store i32 [[A]], ptr addrspace(5) [[TMP0]], align 4 +; NVPTX-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4 +; NVPTX-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], 0 ; NVPTX-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; NVPTX: if.then: ; NVPTX-NEXT: br label [[RETURN:%.*]] ; NVPTX: if.end: -; NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4 -; NVPTX-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 1 +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4 +; NVPTX-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 1 ; NVPTX-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[SUB]]) #[[ATTR9]] ; NVPTX-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after.internalized() #[[ATTR9]] ; NVPTX-NEXT: br label [[RETURN]] @@ -2558,17 +2739,18 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment, ptr [[DYN]]) -; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment, ptr [[DYN]]) +; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP1]], -1 ; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; NVPTX: is_worker_check: ; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 
@__kmpc_get_hardware_num_threads_in_block() ; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() ; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] +; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP1]], [[BLOCK_SIZE]] ; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] ; NVPTX: worker_state_machine.begin: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) ; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null @@ -2578,19 +2760,20 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX: worker_state_machine.is_active.check: ; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] ; NVPTX: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP1]]) ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] ; NVPTX: worker_state_machine.parallel_region.end: ; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; NVPTX: worker_state_machine.done.barrier: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP1]]) ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; NVPTX: thread.user_code.check: -; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX: user_code.entry: -; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__omp_outlined__16(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: ret void @@ -2604,6 +2787,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @weak_callee_empty() #[[ATTR9]] ; NVPTX-NEXT: ret void ; @@ -2621,6 +2806,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = 
addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @p0() #[[ATTR11]] ; NVPTX-NEXT: ret void ; @@ -2633,6 +2820,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__17(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2644,6 +2834,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @p0() #[[ATTR11]] ; NVPTX-NEXT: ret void ; @@ -2656,6 +2848,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__18(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2687,6 +2882,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @p0() #[[ATTR11]] ; NVPTX-NEXT: ret void ; @@ -2699,6 +2896,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__19(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX-NEXT: ret void @@ -2710,11 +2910,13 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment, ptr 
[[DYN]]) -; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment, ptr [[DYN]]) +; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU-DISABLED: user_code.entry: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3:[0-9]+]] +; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3:[0-9]+]] +; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED-NEXT: ret void @@ -2733,6 +2935,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-DISABLED-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9:[0-9]+]] ; AMDGPU-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR10:[0-9]+]] ; AMDGPU-DISABLED-NEXT: ret void @@ -2778,11 +2982,13 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment, ptr [[DYN]]) -; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment, ptr [[DYN]]) +; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU-DISABLED: user_code.entry: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED-NEXT: ret void @@ -2794,9 +3000,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1 ; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = 
alloca ptr, align 8 ; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 ; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR10]] ; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__2, ptr @__omp_outlined__2_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-DISABLED-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9]] @@ -2810,6 +3019,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-DISABLED-NEXT: call void @p0() #[[ATTR11:[0-9]+]] ; AMDGPU-DISABLED-NEXT: ret void ; @@ -2822,6 +3033,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__2(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: ret void @@ -2833,6 +3047,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-DISABLED-NEXT: call void @p1() #[[ATTR11]] ; AMDGPU-DISABLED-NEXT: ret void ; @@ -2845,6 +3061,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: ret void @@ -2856,11 +3075,13 @@ attributes #9 = { convergent nounwind readonly willreturn } ; 
AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr [[DYN]]) -; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr [[DYN]]) +; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU-DISABLED: user_code.entry: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED-NEXT: ret void @@ -2872,8 +3093,11 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__4 ; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR10]] ; AMDGPU-DISABLED-NEXT: call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR9]] ; AMDGPU-DISABLED-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9]] @@ -2908,6 +3132,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU-DISABLED-NEXT: call void @p1() #[[ATTR11]] ; AMDGPU-DISABLED-NEXT: ret void ; @@ -2920,6 +3146,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU-DISABLED-NEXT: call void 
@__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: ret void
@@ -2951,11 +3180,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment, ptr [[DYN]])
-; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment, ptr [[DYN]])
+; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU-DISABLED: user_code.entry:
-; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED-NEXT: ret void
@@ -2967,9 +3198,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__6
; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; AMDGPU-DISABLED-NEXT: entry:
+; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU-DISABLED-NEXT: [[CALL:%.*]] = call i32 @unknown() #[[ATTR11]]
; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__8, ptr @__omp_outlined__8_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0)
@@ -2982,6 +3216,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @p0() #[[ATTR11]]
; AMDGPU-DISABLED-NEXT: ret void
;
@@ -2994,6 +3230,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: ret void
@@ -3005,6 +3244,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @p1() #[[ATTR11]]
; AMDGPU-DISABLED-NEXT: ret void
;
@@ -3017,6 +3258,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__8(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: ret void
@@ -3028,11 +3272,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment, ptr [[DYN]])
-; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment, ptr [[DYN]])
+; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU-DISABLED: user_code.entry:
-; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__9(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED-NEXT: ret void
@@ -3044,9 +3290,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9
; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; AMDGPU-DISABLED-NEXT: entry:
+; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__10, ptr @__omp_outlined__10_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR10]]
; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__11, ptr @__omp_outlined__11_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0)
@@ -3059,6 +3308,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @p0() #[[ATTR11]]
; AMDGPU-DISABLED-NEXT: ret void
;
@@ -3071,6 +3322,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__10(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: ret void
@@ -3082,6 +3336,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @p1() #[[ATTR11]]
; AMDGPU-DISABLED-NEXT: ret void
;
@@ -3094,6 +3350,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__11(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: ret void
@@ -3105,11 +3364,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment, ptr [[DYN]])
-; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment, ptr [[DYN]])
+; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU-DISABLED: user_code.entry:
-; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__12(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED-NEXT: ret void
@@ -3121,9 +3382,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__12
; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; AMDGPU-DISABLED-NEXT: entry:
+; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR10]]
; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__13, ptr @__omp_outlined__13_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__14, ptr @__omp_outlined__14_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0)
@@ -3136,6 +3400,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @p0() #[[ATTR11]]
; AMDGPU-DISABLED-NEXT: ret void
;
@@ -3148,6 +3414,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__13(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: ret void
@@ -3159,6 +3428,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @p1() #[[ATTR11]]
; AMDGPU-DISABLED-NEXT: ret void
;
@@ -3171,6 +3442,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__14(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: ret void
@@ -3182,11 +3456,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment, ptr [[DYN]])
-; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment, ptr [[DYN]])
+; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU-DISABLED: user_code.entry:
-; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__15(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED-NEXT: ret void
@@ -3200,6 +3476,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: [[CALL:%.*]] = call i32 @omp_get_thread_num() #[[ATTR9]]
; AMDGPU-DISABLED-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[CALL]]) #[[ATTR9]]
; AMDGPU-DISABLED-NEXT: ret void
@@ -3210,15 +3488,18 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-SAME: (i32 [[A:%.*]]) #[[ATTR6]] {
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
-; AMDGPU-DISABLED-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
-; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: store i32 [[A]], ptr addrspace(5) [[TMP0]], align 4
+; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4
+; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], 0
; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
; AMDGPU-DISABLED: if.then:
; AMDGPU-DISABLED-NEXT: br label [[RETURN:%.*]]
; AMDGPU-DISABLED: if.end:
-; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; AMDGPU-DISABLED-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 1
+; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4
+; AMDGPU-DISABLED-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 1
; AMDGPU-DISABLED-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[SUB]]) #[[ATTR9]]
; AMDGPU-DISABLED-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after.internalized() #[[ATTR9]]
; AMDGPU-DISABLED-NEXT: br label [[RETURN]]
@@ -3253,11 +3534,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment, ptr [[DYN]])
-; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment, ptr [[DYN]])
+; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU-DISABLED: user_code.entry:
-; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__16(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-DISABLED-NEXT: ret void
@@ -3271,6 +3554,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @weak_callee_empty() #[[ATTR9]]
; AMDGPU-DISABLED-NEXT: ret void
;
@@ -3288,6 +3573,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @p0() #[[ATTR11]]
; AMDGPU-DISABLED-NEXT: ret void
;
@@ -3300,6 +3587,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__17(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: ret void
@@ -3311,6 +3601,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @p0() #[[ATTR11]]
; AMDGPU-DISABLED-NEXT: ret void
;
@@ -3323,6 +3615,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__18(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: ret void
@@ -3354,6 +3649,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @p0() #[[ATTR11]]
; AMDGPU-DISABLED-NEXT: ret void
;
@@ -3366,6 +3663,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__19(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: ret void
@@ -3377,11 +3677,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment, ptr [[DYN]])
-; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment, ptr [[DYN]])
+; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX-DISABLED: user_code.entry:
-; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3:[0-9]+]]
+; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3:[0-9]+]]
+; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED-NEXT: ret void
@@ -3400,6 +3702,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9:[0-9]+]]
; NVPTX-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR10:[0-9]+]]
; NVPTX-DISABLED-NEXT: ret void
@@ -3445,11 +3749,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment, ptr [[DYN]])
-; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment, ptr [[DYN]])
+; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX-DISABLED: user_code.entry:
-; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED-NEXT: ret void
@@ -3461,9 +3767,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1
; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; NVPTX-DISABLED-NEXT: entry:
+; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR10]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__2, ptr @__omp_outlined__2_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX-DISABLED-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9]]
@@ -3477,6 +3786,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @p0() #[[ATTR11:[0-9]+]]
; NVPTX-DISABLED-NEXT: ret void
;
@@ -3489,6 +3800,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__2(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: ret void
@@ -3500,6 +3814,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @p1() #[[ATTR11]]
; NVPTX-DISABLED-NEXT: ret void
;
@@ -3512,6 +3828,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: ret void
@@ -3523,11 +3842,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr [[DYN]])
-; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr [[DYN]])
+; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX-DISABLED: user_code.entry:
-; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED-NEXT: ret void
@@ -3539,8 +3860,11 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__4
; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; NVPTX-DISABLED-NEXT: entry:
+; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR10]]
; NVPTX-DISABLED-NEXT: call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR9]]
; NVPTX-DISABLED-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9]]
@@ -3575,6 +3899,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @p1() #[[ATTR11]]
; NVPTX-DISABLED-NEXT: ret void
;
@@ -3587,6 +3913,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: ret void
@@ -3618,11 +3947,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment, ptr [[DYN]])
-; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment, ptr [[DYN]])
+; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX-DISABLED: user_code.entry:
-; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED-NEXT: ret void
@@ -3634,9 +3965,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__6
; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; NVPTX-DISABLED-NEXT: entry:
+; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX-DISABLED-NEXT: [[CALL:%.*]] = call i32 @unknown() #[[ATTR11]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__8, ptr @__omp_outlined__8_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0)
@@ -3649,6 +3983,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @p0() #[[ATTR11]]
; NVPTX-DISABLED-NEXT: ret void
;
@@ -3661,6 +3997,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: ret void
@@ -3672,6 +4011,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @p1() #[[ATTR11]]
; NVPTX-DISABLED-NEXT: ret void
;
@@ -3684,6 +4025,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__8(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: ret void
@@ -3695,11 +4039,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment, ptr [[DYN]])
-; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment, ptr [[DYN]])
+; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX-DISABLED: user_code.entry:
-; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__9(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED-NEXT: ret void
@@ -3711,9 +4057,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9
; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; NVPTX-DISABLED-NEXT: entry:
+; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__10, ptr @__omp_outlined__10_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR10]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__11, ptr @__omp_outlined__11_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0)
@@ -3726,6 +4075,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @p0() #[[ATTR11]]
; NVPTX-DISABLED-NEXT: ret void
;
@@ -3738,6 +4089,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__10(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: ret void
@@ -3749,6 +4103,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @p1() #[[ATTR11]]
; NVPTX-DISABLED-NEXT: ret void
;
@@ -3761,6 +4117,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__11(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: ret void
@@ -3772,11 +4131,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment, ptr [[DYN]])
-; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment, ptr [[DYN]])
+; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX-DISABLED: user_code.entry:
-; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__12(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED-NEXT: ret void
@@ -3788,9 +4149,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__12
; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; NVPTX-DISABLED-NEXT: entry:
+; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR10]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__13, ptr @__omp_outlined__13_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__14, ptr @__omp_outlined__14_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0)
@@ -3803,6 +4167,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @p0() #[[ATTR11]]
; NVPTX-DISABLED-NEXT: ret void
;
@@ -3815,6 +4181,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__13(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: ret void
@@ -3826,6 +4195,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @p1() #[[ATTR11]]
; NVPTX-DISABLED-NEXT: ret void
;
@@ -3838,6 +4209,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__14(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: ret void
@@ -3849,11 +4223,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment, ptr [[DYN]])
-; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment, ptr [[DYN]])
+; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX-DISABLED: user_code.entry:
-; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__15(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED-NEXT: ret void
@@ -3867,6 +4243,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: [[CALL:%.*]] = call i32 @omp_get_thread_num() #[[ATTR9]]
; NVPTX-DISABLED-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[CALL]]) #[[ATTR9]]
; NVPTX-DISABLED-NEXT: ret void
@@ -3877,15 +4255,18 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-SAME: (i32 [[A:%.*]]) #[[ATTR6]] {
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
-; NVPTX-DISABLED-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
-; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: store i32 [[A]], ptr addrspace(5) [[TMP0]], align 4
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4
+; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], 0
; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
; NVPTX-DISABLED: if.then:
; NVPTX-DISABLED-NEXT: br label [[RETURN:%.*]]
; NVPTX-DISABLED: if.end:
-; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; NVPTX-DISABLED-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 1
+; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4
+; NVPTX-DISABLED-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 1
; NVPTX-DISABLED-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[SUB]]) #[[ATTR9]]
; NVPTX-DISABLED-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after.internalized() #[[ATTR9]]
; NVPTX-DISABLED-NEXT: br label [[RETURN]]
@@ -3920,11 +4301,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment, ptr [[DYN]])
-; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment, ptr [[DYN]])
+; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX-DISABLED: user_code.entry:
-; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__16(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit()
; NVPTX-DISABLED-NEXT: ret void
@@ -3938,6 +4321,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @weak_callee_empty() #[[ATTR9]]
; NVPTX-DISABLED-NEXT: ret void
;
@@ -3955,6 +4340,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @p0() #[[ATTR11]]
; NVPTX-DISABLED-NEXT: ret void
;
@@ -3967,6 +4354,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__17(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: ret void
@@ -3978,6 +4368,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @p0() #[[ATTR11]]
; NVPTX-DISABLED-NEXT: ret void
;
@@ -3990,6 +4382,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__18(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: ret void
@@ -4021,6 +4416,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @p0() #[[ATTR11]]
; NVPTX-DISABLED-NEXT: ret void
;
@@ -4033,6 +4430,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__19(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: ret void
diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll b/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll
index 9576ff6ca6aee..fa36db3ccef3f 100644
--- a/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll
+++ b/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll
@@ -915,11 +915,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU1-NEXT: entry:
; AMDGPU1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment, ptr [[DYN]])
-; AMDGPU1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment, ptr [[DYN]])
+; AMDGPU1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU1: user_code.entry:
-; AMDGPU1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3:[0-9]+]]
+; AMDGPU1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3:[0-9]+]]
+; AMDGPU1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; AMDGPU1-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU1-NEXT: call void @__kmpc_target_deinit()
; AMDGPU1-NEXT: ret void
@@ -933,6 +935,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU1-NEXT: entry:
; AMDGPU1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU1-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9:[0-9]+]]
; AMDGPU1-NEXT: call void @unknown_no_openmp() #[[ATTR10:[0-9]+]]
; AMDGPU1-NEXT: ret void
@@ -978,11 +982,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU1-NEXT: entry:
; AMDGPU1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment, ptr [[DYN]])
-; AMDGPU1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment, ptr [[DYN]])
+; AMDGPU1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU1: user_code.entry:
-; AMDGPU1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; AMDGPU1-NEXT: call void @__omp_outlined__1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU1-NEXT: call void @__kmpc_target_deinit()
; AMDGPU1-NEXT: ret void
@@ -994,9 +1000,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU1-LABEL: define {{[^@]+}}@__omp_outlined__1
; AMDGPU1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; AMDGPU1-NEXT: entry:
+; AMDGPU1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
;
AMDGPU1-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @unknown_no_openmp() #[[ATTR10]] ; AMDGPU1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__2, ptr @__omp_outlined__2_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU1-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9]] @@ -1010,6 +1019,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: entry: ; AMDGPU1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @p0() #[[ATTR11:[0-9]+]] ; AMDGPU1-NEXT: ret void ; @@ -1022,6 +1033,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU1-NEXT: call void @__omp_outlined__2(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU1-NEXT: ret void @@ -1033,6 +1047,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: entry: ; AMDGPU1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @p1() #[[ATTR11]] ; AMDGPU1-NEXT: ret void ; @@ -1045,6 +1061,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU1-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU1-NEXT: ret void @@ -1056,11 +1075,13 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: entry: ; AMDGPU1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr [[DYN]]) -; AMDGPU1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: 
[[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr [[DYN]]) +; AMDGPU1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; AMDGPU1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU1: user_code.entry: -; AMDGPU1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU1-NEXT: call void @__kmpc_target_deinit() ; AMDGPU1-NEXT: ret void @@ -1072,8 +1093,11 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-LABEL: define {{[^@]+}}@__omp_outlined__4 ; AMDGPU1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU1-NEXT: entry: +; AMDGPU1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @unknown_no_openmp() #[[ATTR10]] ; AMDGPU1-NEXT: call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR9]] ; AMDGPU1-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9]] @@ -1108,6 +1132,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: entry: ; AMDGPU1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @p1() #[[ATTR11]] ; AMDGPU1-NEXT: ret void ; @@ -1120,6 +1146,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU1-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU1-NEXT: ret void @@ -1151,11 +1180,13 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: entry: ; AMDGPU1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment, ptr [[DYN]]) -; AMDGPU1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr 
@__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment, ptr [[DYN]]) +; AMDGPU1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; AMDGPU1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU1: user_code.entry: -; AMDGPU1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU1-NEXT: call void @__kmpc_target_deinit() ; AMDGPU1-NEXT: ret void @@ -1167,9 +1198,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-LABEL: define {{[^@]+}}@__omp_outlined__6 ; AMDGPU1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU1-NEXT: entry: +; AMDGPU1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 ; AMDGPU1-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU1-NEXT: [[CALL:%.*]] = call i32 @unknown() #[[ATTR11]] ; AMDGPU1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__8, ptr @__omp_outlined__8_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0) @@ -1182,6 +1216,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: entry: ; AMDGPU1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @p0() #[[ATTR11]] ; AMDGPU1-NEXT: ret void ; @@ -1194,6 +1230,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU1-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU1-NEXT: ret void @@ -1205,6 +1244,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: entry: ; AMDGPU1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU1-NEXT: 
call void @p1() #[[ATTR11]] ; AMDGPU1-NEXT: ret void ; @@ -1217,6 +1258,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU1-NEXT: call void @__omp_outlined__8(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU1-NEXT: ret void @@ -1228,11 +1272,13 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: entry: ; AMDGPU1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment, ptr [[DYN]]) -; AMDGPU1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment, ptr [[DYN]]) +; AMDGPU1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; AMDGPU1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU1: user_code.entry: -; AMDGPU1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @__omp_outlined__9(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU1-NEXT: call void @__kmpc_target_deinit() ; AMDGPU1-NEXT: ret void @@ -1244,9 +1290,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-LABEL: define {{[^@]+}}@__omp_outlined__9 ; AMDGPU1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU1-NEXT: entry: +; AMDGPU1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 ; AMDGPU1-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__10, ptr @__omp_outlined__10_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU1-NEXT: call void @unknown_no_openmp() #[[ATTR10]] ; AMDGPU1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__11, ptr @__omp_outlined__11_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0) @@ -1259,6 +1308,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: entry: ; AMDGPU1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: 
[[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @p0() #[[ATTR11]] ; AMDGPU1-NEXT: ret void ; @@ -1271,6 +1322,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU1-NEXT: call void @__omp_outlined__10(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU1-NEXT: ret void @@ -1282,6 +1336,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: entry: ; AMDGPU1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @p1() #[[ATTR11]] ; AMDGPU1-NEXT: ret void ; @@ -1294,6 +1350,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU1-NEXT: call void @__omp_outlined__11(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU1-NEXT: ret void @@ -1305,11 +1364,13 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: entry: ; AMDGPU1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment, ptr [[DYN]]) -; AMDGPU1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment, ptr [[DYN]]) +; AMDGPU1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; AMDGPU1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU1: user_code.entry: -; AMDGPU1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @__omp_outlined__12(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU1-NEXT: call void @__kmpc_target_deinit() ; AMDGPU1-NEXT: ret void @@ -1321,9 +1382,12 @@ attributes #9 = { convergent 
nounwind readonly willreturn } ; AMDGPU1-LABEL: define {{[^@]+}}@__omp_outlined__12 ; AMDGPU1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU1-NEXT: entry: +; AMDGPU1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 ; AMDGPU1-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @unknown_no_openmp() #[[ATTR10]] ; AMDGPU1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__13, ptr @__omp_outlined__13_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__14, ptr @__omp_outlined__14_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0) @@ -1336,6 +1400,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: entry: ; AMDGPU1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @p0() #[[ATTR11]] ; AMDGPU1-NEXT: ret void ; @@ -1348,6 +1414,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU1-NEXT: call void @__omp_outlined__13(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU1-NEXT: ret void @@ -1359,6 +1428,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: entry: ; AMDGPU1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @p1() #[[ATTR11]] ; AMDGPU1-NEXT: ret void ; @@ -1371,6 +1442,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU1-NEXT: call void @__omp_outlined__14(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU1-NEXT: ret void @@ -1382,11 +1456,13 @@ attributes #9 = { 
convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: entry: ; AMDGPU1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment, ptr [[DYN]]) -; AMDGPU1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment, ptr [[DYN]]) +; AMDGPU1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; AMDGPU1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU1: user_code.entry: -; AMDGPU1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @__omp_outlined__15(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU1-NEXT: call void @__kmpc_target_deinit() ; AMDGPU1-NEXT: ret void @@ -1400,6 +1476,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: entry: ; AMDGPU1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU1-NEXT: [[CALL:%.*]] = call i32 @omp_get_thread_num() #[[ATTR9]] ; AMDGPU1-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[CALL]]) #[[ATTR9]] ; AMDGPU1-NEXT: ret void @@ -1410,15 +1488,18 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-SAME: (i32 [[A:%.*]]) #[[ATTR6]] { ; AMDGPU1-NEXT: entry: ; AMDGPU1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -; AMDGPU1-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 -; AMDGPU1-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: store i32 [[A]], ptr addrspace(5) [[TMP0]], align 4 +; AMDGPU1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4 +; AMDGPU1-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], 0 ; AMDGPU1-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; AMDGPU1: if.then: ; AMDGPU1-NEXT: br label [[RETURN:%.*]] ; AMDGPU1: if.end: -; AMDGPU1-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4 -; AMDGPU1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 1 +; AMDGPU1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4 +; AMDGPU1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 1 ; AMDGPU1-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[SUB]]) #[[ATTR9]] ; AMDGPU1-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after.internalized() #[[ATTR9]] ; AMDGPU1-NEXT: br label [[RETURN]] @@ -1453,11 +1534,13 @@ 
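; ---------------------------------------------------------------------------
; Editorial note (not part of the patch): the regenerated checks above and
; below all follow one pattern. With the AAAddressSpace change, the
; Attributor queries TTI for the assumed address space of each alloca, so on
; AMDGPU and NVPTX a generic-pointer alloca now gets an explicit
; `addrspacecast ... to ptr addrspace(5)` (5 = private on AMDGPU, local on
; NVPTX). Where the alloca is actually accessed, loads and stores are
; rewritten to go through the addrspace(5) pointer, with a fresh cast emitted
; before each rewritten access; where the alloca only escapes into a call
; (e.g. the __omp_outlined__* wrappers), the casts are left dead and the call
; still receives the generic `ptr`. A minimal standalone sketch of the
; rewritten shape, mirroring the A_ADDR hunk directly above (the function
; name is hypothetical, not from the tests):
;
define i32 @illustrative_private_alloca(i32 %a) {
entry:
  %a.addr = alloca i32, align 4
  ; Cast to the assumed address space, then store through it.
  %0 = addrspacecast ptr %a.addr to ptr addrspace(5)
  store i32 %a, ptr addrspace(5) %0, align 4
  ; A separate cast is emitted for the subsequent load.
  %1 = addrspacecast ptr %a.addr to ptr addrspace(5)
  %2 = load i32, ptr addrspace(5) %1, align 4
  ret i32 %2
}
; ---------------------------------------------------------------------------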
attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: entry: ; AMDGPU1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment, ptr [[DYN]]) -; AMDGPU1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment, ptr [[DYN]]) +; AMDGPU1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; AMDGPU1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU1: user_code.entry: -; AMDGPU1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @__omp_outlined__16(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU1-NEXT: call void @__kmpc_target_deinit() ; AMDGPU1-NEXT: ret void @@ -1471,6 +1554,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: entry: ; AMDGPU1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @weak_callee_empty() #[[ATTR9]] ; AMDGPU1-NEXT: ret void ; @@ -1488,6 +1573,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: entry: ; AMDGPU1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @p0() #[[ATTR11]] ; AMDGPU1-NEXT: ret void ; @@ -1500,6 +1587,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU1-NEXT: call void @__omp_outlined__17(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU1-NEXT: ret void @@ -1511,6 +1601,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: entry: ; AMDGPU1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @p0() #[[ATTR11]] ; AMDGPU1-NEXT: ret void ; @@ -1523,6 
+1615,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU1-NEXT: call void @__omp_outlined__18(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU1-NEXT: ret void @@ -1554,6 +1649,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: entry: ; AMDGPU1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @p0() #[[ATTR11]] ; AMDGPU1-NEXT: ret void ; @@ -1566,6 +1663,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU1-NEXT: call void @__omp_outlined__19(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU1-NEXT: ret void @@ -1577,11 +1677,13 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-NEXT: entry: ; NVPTX1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment, ptr [[DYN]]) -; NVPTX1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment, ptr [[DYN]]) +; NVPTX1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; NVPTX1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX1: user_code.entry: -; NVPTX1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3:[0-9]+]] +; NVPTX1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3:[0-9]+]] +; NVPTX1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX1-NEXT: call void @__kmpc_target_deinit() ; NVPTX1-NEXT: ret void @@ -1595,6 +1697,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-NEXT: entry: ; NVPTX1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr 
addrspace(5) +; NVPTX1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9:[0-9]+]] ; NVPTX1-NEXT: call void @unknown_no_openmp() #[[ATTR10:[0-9]+]] ; NVPTX1-NEXT: ret void @@ -1640,11 +1744,13 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-NEXT: entry: ; NVPTX1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment, ptr [[DYN]]) -; NVPTX1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment, ptr [[DYN]]) +; NVPTX1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; NVPTX1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX1: user_code.entry: -; NVPTX1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @__omp_outlined__1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX1-NEXT: call void @__kmpc_target_deinit() ; NVPTX1-NEXT: ret void @@ -1656,9 +1762,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-LABEL: define {{[^@]+}}@__omp_outlined__1 ; NVPTX1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX1-NEXT: entry: +; NVPTX1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 ; NVPTX1-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8 +; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @unknown_no_openmp() #[[ATTR10]] ; NVPTX1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__2, ptr @__omp_outlined__2_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX1-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9]] @@ -1672,6 +1781,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-NEXT: entry: ; NVPTX1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @p0() #[[ATTR11:[0-9]+]] ; NVPTX1-NEXT: ret void ; @@ -1684,6 +1795,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr 
addrspace(5) +; NVPTX1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX1-NEXT: call void @__omp_outlined__2(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX1-NEXT: ret void @@ -1695,6 +1809,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-NEXT: entry: ; NVPTX1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @p1() #[[ATTR11]] ; NVPTX1-NEXT: ret void ; @@ -1707,6 +1823,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX1-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX1-NEXT: ret void @@ -1718,11 +1837,13 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-NEXT: entry: ; NVPTX1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr [[DYN]]) -; NVPTX1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr [[DYN]]) +; NVPTX1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; NVPTX1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX1: user_code.entry: -; NVPTX1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX1-NEXT: call void @__kmpc_target_deinit() ; NVPTX1-NEXT: ret void @@ -1734,8 +1855,11 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-LABEL: define {{[^@]+}}@__omp_outlined__4 ; NVPTX1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX1-NEXT: entry: +; NVPTX1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 +; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @unknown_no_openmp() #[[ATTR10]] ; NVPTX1-NEXT: call void 
@simple_state_machine_interprocedural_before.internalized() #[[ATTR9]] ; NVPTX1-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9]] @@ -1770,6 +1894,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-NEXT: entry: ; NVPTX1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @p1() #[[ATTR11]] ; NVPTX1-NEXT: ret void ; @@ -1782,6 +1908,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX1-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX1-NEXT: ret void @@ -1813,11 +1942,13 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-NEXT: entry: ; NVPTX1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment, ptr [[DYN]]) -; NVPTX1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment, ptr [[DYN]]) +; NVPTX1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; NVPTX1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX1: user_code.entry: -; NVPTX1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX1-NEXT: call void @__kmpc_target_deinit() ; NVPTX1-NEXT: ret void @@ -1829,9 +1960,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-LABEL: define {{[^@]+}}@__omp_outlined__6 ; NVPTX1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX1-NEXT: entry: +; NVPTX1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 ; NVPTX1-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8 +; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr 
[[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX1-NEXT: [[CALL:%.*]] = call i32 @unknown() #[[ATTR11]] ; NVPTX1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__8, ptr @__omp_outlined__8_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0) @@ -1844,6 +1978,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-NEXT: entry: ; NVPTX1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @p0() #[[ATTR11]] ; NVPTX1-NEXT: ret void ; @@ -1856,6 +1992,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX1-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX1-NEXT: ret void @@ -1867,6 +2006,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-NEXT: entry: ; NVPTX1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @p1() #[[ATTR11]] ; NVPTX1-NEXT: ret void ; @@ -1879,6 +2020,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX1-NEXT: call void @__omp_outlined__8(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX1-NEXT: ret void @@ -1890,11 +2034,13 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-NEXT: entry: ; NVPTX1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment, ptr [[DYN]]) -; NVPTX1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment, ptr [[DYN]]) +; NVPTX1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; NVPTX1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX1: user_code.entry: -; 
NVPTX1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @__omp_outlined__9(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX1-NEXT: call void @__kmpc_target_deinit() ; NVPTX1-NEXT: ret void @@ -1906,9 +2052,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-LABEL: define {{[^@]+}}@__omp_outlined__9 ; NVPTX1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX1-NEXT: entry: +; NVPTX1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 ; NVPTX1-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8 +; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__10, ptr @__omp_outlined__10_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX1-NEXT: call void @unknown_no_openmp() #[[ATTR10]] ; NVPTX1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__11, ptr @__omp_outlined__11_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0) @@ -1921,6 +2070,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-NEXT: entry: ; NVPTX1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @p0() #[[ATTR11]] ; NVPTX1-NEXT: ret void ; @@ -1933,6 +2084,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX1-NEXT: call void @__omp_outlined__10(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX1-NEXT: ret void @@ -1944,6 +2098,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-NEXT: entry: ; NVPTX1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX1-NEXT: call void @p1() #[[ATTR11]] ; NVPTX1-NEXT: ret void ; @@ -1956,6 +2112,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX1-NEXT: [[TMP2:%.*]] = 
addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX1-NEXT: call void @__omp_outlined__11(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX1-NEXT: ret void
@@ -1967,11 +2126,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX1-NEXT: entry:
; NVPTX1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment, ptr [[DYN]])
-; NVPTX1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment, ptr [[DYN]])
+; NVPTX1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX1: user_code.entry:
-; NVPTX1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX1-NEXT: call void @__omp_outlined__12(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX1-NEXT: call void @__kmpc_target_deinit()
; NVPTX1-NEXT: ret void
@@ -1983,9 +2144,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX1-LABEL: define {{[^@]+}}@__omp_outlined__12
; NVPTX1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; NVPTX1-NEXT: entry:
+; NVPTX1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX1-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX1-NEXT: call void @unknown_no_openmp() #[[ATTR10]]
; NVPTX1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__13, ptr @__omp_outlined__13_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__14, ptr @__omp_outlined__14_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0)
@@ -1998,6 +2162,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX1-NEXT: entry:
; NVPTX1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX1-NEXT: call void @p0() #[[ATTR11]]
; NVPTX1-NEXT: ret void
;
@@ -2010,6 +2176,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX1-NEXT: call void @__omp_outlined__13(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX1-NEXT: ret void
@@ -2021,6 +2190,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX1-NEXT: entry:
; NVPTX1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX1-NEXT: call void @p1() #[[ATTR11]]
; NVPTX1-NEXT: ret void
;
@@ -2033,6 +2204,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX1-NEXT: call void @__omp_outlined__14(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX1-NEXT: ret void
@@ -2044,11 +2218,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX1-NEXT: entry:
; NVPTX1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment, ptr [[DYN]])
-; NVPTX1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment, ptr [[DYN]])
+; NVPTX1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX1: user_code.entry:
-; NVPTX1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX1-NEXT: call void @__omp_outlined__15(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX1-NEXT: call void @__kmpc_target_deinit()
; NVPTX1-NEXT: ret void
@@ -2062,6 +2238,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX1-NEXT: entry:
; NVPTX1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX1-NEXT: [[CALL:%.*]] = call i32 @omp_get_thread_num() #[[ATTR9]]
; NVPTX1-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[CALL]]) #[[ATTR9]]
; NVPTX1-NEXT: ret void
@@ -2072,15 +2250,18 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX1-SAME: (i32 [[A:%.*]]) #[[ATTR6]] {
; NVPTX1-NEXT: entry:
; NVPTX1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
-; NVPTX1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
-; NVPTX1-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; NVPTX1-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0
+; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: store i32 [[A]], ptr addrspace(5) [[TMP0]], align 4
+; NVPTX1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4
+; NVPTX1-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], 0
; NVPTX1-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
; NVPTX1: if.then:
; NVPTX1-NEXT: br label [[RETURN:%.*]]
; NVPTX1: if.end:
-; NVPTX1-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; NVPTX1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 1
+; NVPTX1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4
+; NVPTX1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 1
; NVPTX1-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[SUB]]) #[[ATTR9]]
; NVPTX1-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after.internalized() #[[ATTR9]]
; NVPTX1-NEXT: br label [[RETURN]]
@@ -2115,11 +2296,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX1-NEXT: entry:
; NVPTX1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment, ptr [[DYN]])
-; NVPTX1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment, ptr [[DYN]])
+; NVPTX1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX1: user_code.entry:
-; NVPTX1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX1-NEXT: call void @__omp_outlined__16(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX1-NEXT: call void @__kmpc_target_deinit()
; NVPTX1-NEXT: ret void
@@ -2133,6 +2316,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX1-NEXT: entry:
; NVPTX1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX1-NEXT: call void @weak_callee_empty() #[[ATTR9]]
; NVPTX1-NEXT: ret void
;
@@ -2150,6 +2335,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX1-NEXT: entry:
; NVPTX1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX1-NEXT: call void @p0() #[[ATTR11]]
; NVPTX1-NEXT: ret void
;
@@ -2162,6 +2349,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX1-NEXT: call void @__omp_outlined__17(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX1-NEXT: ret void
@@ -2173,6 +2363,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX1-NEXT: entry:
; NVPTX1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX1-NEXT: call void @p0() #[[ATTR11]]
; NVPTX1-NEXT: ret void
;
@@ -2185,6 +2377,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX1-NEXT: call void @__omp_outlined__18(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX1-NEXT: ret void
@@ -2216,6 +2411,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX1-NEXT: entry:
; NVPTX1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX1-NEXT: call void @p0() #[[ATTR11]]
; NVPTX1-NEXT: ret void
;
@@ -2228,6 +2425,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX1-NEXT: call void @__omp_outlined__19(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX1-NEXT: ret void
@@ -2239,11 +2439,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment, ptr [[DYN]])
-; AMDGPU2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment, ptr [[DYN]])
+; AMDGPU2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU2: user_code.entry:
-; AMDGPU2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3:[0-9]+]]
+; AMDGPU2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3:[0-9]+]]
+; AMDGPU2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU2-NEXT: call void @__kmpc_target_deinit()
; AMDGPU2-NEXT: ret void
@@ -2257,6 +2459,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9:[0-9]+]]
; AMDGPU2-NEXT: call void @unknown_no_openmp() #[[ATTR10:[0-9]+]]
; AMDGPU2-NEXT: ret void
@@ -2302,11 +2506,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment, ptr [[DYN]])
-; AMDGPU2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment, ptr [[DYN]])
+; AMDGPU2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU2: user_code.entry:
-; AMDGPU2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__omp_outlined__1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU2-NEXT: call void @__kmpc_target_deinit()
; AMDGPU2-NEXT: ret void
@@ -2318,9 +2524,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-LABEL: define {{[^@]+}}@__omp_outlined__1
; AMDGPU2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; AMDGPU2-NEXT: entry:
+; AMDGPU2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU2-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @unknown_no_openmp() #[[ATTR10]]
; AMDGPU2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__2, ptr @__omp_outlined__2_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU2-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9]]
@@ -2334,6 +2543,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @p0() #[[ATTR11:[0-9]+]]
; AMDGPU2-NEXT: ret void
;
@@ -2346,6 +2557,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU2-NEXT: call void @__omp_outlined__2(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU2-NEXT: ret void
@@ -2357,6 +2571,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @p1() #[[ATTR11]]
; AMDGPU2-NEXT: ret void
;
@@ -2369,6 +2585,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU2-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU2-NEXT: ret void
@@ -2380,11 +2599,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr [[DYN]])
-; AMDGPU2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr [[DYN]])
+; AMDGPU2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU2: user_code.entry:
-; AMDGPU2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU2-NEXT: call void @__kmpc_target_deinit()
; AMDGPU2-NEXT: ret void
@@ -2396,8 +2617,11 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-LABEL: define {{[^@]+}}@__omp_outlined__4
; AMDGPU2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; AMDGPU2-NEXT: entry:
+; AMDGPU2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @unknown_no_openmp() #[[ATTR10]]
; AMDGPU2-NEXT: call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR9]]
; AMDGPU2-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9]]
@@ -2432,6 +2656,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @p1() #[[ATTR11]]
; AMDGPU2-NEXT: ret void
;
@@ -2444,6 +2670,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU2-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU2-NEXT: ret void
@@ -2475,11 +2704,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment, ptr [[DYN]])
-; AMDGPU2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment, ptr [[DYN]])
+; AMDGPU2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU2: user_code.entry:
-; AMDGPU2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU2-NEXT: call void @__kmpc_target_deinit()
; AMDGPU2-NEXT: ret void
@@ -2491,9 +2722,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-LABEL: define {{[^@]+}}@__omp_outlined__6
; AMDGPU2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; AMDGPU2-NEXT: entry:
+; AMDGPU2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU2-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU2-NEXT: [[CALL:%.*]] = call i32 @unknown() #[[ATTR11]]
; AMDGPU2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__8, ptr @__omp_outlined__8_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0)
@@ -2506,6 +2740,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @p0() #[[ATTR11]]
; AMDGPU2-NEXT: ret void
;
@@ -2518,6 +2754,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU2-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU2-NEXT: ret void
@@ -2529,6 +2768,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @p1() #[[ATTR11]]
; AMDGPU2-NEXT: ret void
;
@@ -2541,6 +2782,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU2-NEXT: call void @__omp_outlined__8(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU2-NEXT: ret void
@@ -2552,11 +2796,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment, ptr [[DYN]])
-; AMDGPU2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment, ptr [[DYN]])
+; AMDGPU2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU2: user_code.entry:
-; AMDGPU2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__omp_outlined__9(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU2-NEXT: call void @__kmpc_target_deinit()
; AMDGPU2-NEXT: ret void
@@ -2568,9 +2814,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-LABEL: define {{[^@]+}}@__omp_outlined__9
; AMDGPU2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; AMDGPU2-NEXT: entry:
+; AMDGPU2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU2-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__10, ptr @__omp_outlined__10_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU2-NEXT: call void @unknown_no_openmp() #[[ATTR10]]
; AMDGPU2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__11, ptr @__omp_outlined__11_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0)
@@ -2583,6 +2832,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @p0() #[[ATTR11]]
; AMDGPU2-NEXT: ret void
;
@@ -2595,6 +2846,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU2-NEXT: call void @__omp_outlined__10(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU2-NEXT: ret void
@@ -2606,6 +2860,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @p1() #[[ATTR11]]
; AMDGPU2-NEXT: ret void
;
@@ -2618,6 +2874,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU2-NEXT: call void @__omp_outlined__11(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU2-NEXT: ret void
@@ -2629,11 +2888,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment, ptr [[DYN]])
-; AMDGPU2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment, ptr [[DYN]])
+; AMDGPU2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU2: user_code.entry:
-; AMDGPU2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__omp_outlined__12(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU2-NEXT: call void @__kmpc_target_deinit()
; AMDGPU2-NEXT: ret void
@@ -2645,9 +2906,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-LABEL: define {{[^@]+}}@__omp_outlined__12
; AMDGPU2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; AMDGPU2-NEXT: entry:
+; AMDGPU2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU2-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @unknown_no_openmp() #[[ATTR10]]
; AMDGPU2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__13, ptr @__omp_outlined__13_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__14, ptr @__omp_outlined__14_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0)
@@ -2660,6 +2924,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @p0() #[[ATTR11]]
; AMDGPU2-NEXT: ret void
;
@@ -2672,6 +2938,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU2-NEXT: call void @__omp_outlined__13(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU2-NEXT: ret void
@@ -2683,6 +2952,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @p1() #[[ATTR11]]
; AMDGPU2-NEXT: ret void
;
@@ -2695,6 +2966,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU2-NEXT: call void @__omp_outlined__14(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU2-NEXT: ret void
@@ -2706,11 +2980,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment, ptr [[DYN]])
-; AMDGPU2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment, ptr [[DYN]])
+; AMDGPU2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU2: user_code.entry:
-; AMDGPU2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__omp_outlined__15(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU2-NEXT: call void @__kmpc_target_deinit()
; AMDGPU2-NEXT: ret void
@@ -2724,6 +3000,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU2-NEXT: [[CALL:%.*]] = call i32 @omp_get_thread_num() #[[ATTR9]]
; AMDGPU2-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[CALL]]) #[[ATTR9]]
; AMDGPU2-NEXT: ret void
@@ -2734,15 +3012,18 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-SAME: (i32 [[A:%.*]]) #[[ATTR6]] {
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
-; AMDGPU2-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
-; AMDGPU2-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; AMDGPU2-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: store i32 [[A]], ptr addrspace(5) [[TMP0]], align 4
+; AMDGPU2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4
+; AMDGPU2-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], 0
; AMDGPU2-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
; AMDGPU2: if.then:
; AMDGPU2-NEXT: br label [[RETURN:%.*]]
; AMDGPU2: if.end:
-; AMDGPU2-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; AMDGPU2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 1
+; AMDGPU2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4
+; AMDGPU2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 1
; AMDGPU2-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[SUB]]) #[[ATTR9]]
; AMDGPU2-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after.internalized() #[[ATTR9]]
; AMDGPU2-NEXT: br label [[RETURN]]
@@ -2777,11 +3058,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment, ptr [[DYN]])
-; AMDGPU2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment, ptr [[DYN]])
+; AMDGPU2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU2: user_code.entry:
-; AMDGPU2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__omp_outlined__16(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU2-NEXT: call void @__kmpc_target_deinit()
; AMDGPU2-NEXT: ret void
@@ -2795,6 +3078,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @weak_callee_empty() #[[ATTR9]]
; AMDGPU2-NEXT: ret void
;
@@ -2812,6 +3097,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @p0() #[[ATTR11]]
; AMDGPU2-NEXT: ret void
;
@@ -2824,6 +3111,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU2-NEXT: call void @__omp_outlined__17(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU2-NEXT: ret void
@@ -2835,6 +3125,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @p0() #[[ATTR11]]
; AMDGPU2-NEXT: ret void
;
@@ -2847,6 +3139,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU2-NEXT: call void @__omp_outlined__18(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU2-NEXT: ret void
@@ -2878,6 +3173,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: entry:
; AMDGPU2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @p0() #[[ATTR11]]
; AMDGPU2-NEXT: ret void
;
@@ -2890,6 +3187,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU2-NEXT: call void @__omp_outlined__19(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU2-NEXT: ret void
@@ -2901,11 +3201,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: entry:
; AMDGPU3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment, ptr [[DYN]])
-; AMDGPU3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment, ptr [[DYN]])
+; AMDGPU3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU3: user_code.entry:
-; AMDGPU3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3:[0-9]+]]
+; AMDGPU3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3:[0-9]+]]
+; AMDGPU3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU3-NEXT: call void @__kmpc_target_deinit()
; AMDGPU3-NEXT: ret void
@@ -2919,6 +3221,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: entry:
; AMDGPU3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9:[0-9]+]]
; AMDGPU3-NEXT: call void @unknown_no_openmp() #[[ATTR10:[0-9]+]]
; AMDGPU3-NEXT: ret void
@@ -2964,11 +3268,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: entry:
; AMDGPU3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment, ptr [[DYN]])
-; AMDGPU3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment, ptr [[DYN]])
+; AMDGPU3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU3: user_code.entry:
-; AMDGPU3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @__omp_outlined__1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU3-NEXT: call void @__kmpc_target_deinit()
; AMDGPU3-NEXT: ret void
@@ -2980,9 +3286,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-LABEL: define {{[^@]+}}@__omp_outlined__1
; AMDGPU3-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; AMDGPU3-NEXT: entry:
+; AMDGPU3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU3-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @unknown_no_openmp() #[[ATTR10]]
; AMDGPU3-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__2, ptr @__omp_outlined__2_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU3-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9]]
@@ -2996,6 +3305,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: entry:
; AMDGPU3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @p0() #[[ATTR11:[0-9]+]]
; AMDGPU3-NEXT: ret void
;
@@ -3008,6 +3319,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU3-NEXT: call void @__omp_outlined__2(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU3-NEXT: ret void
@@ -3019,6 +3333,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: entry:
; AMDGPU3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @p1() #[[ATTR11]]
; AMDGPU3-NEXT: ret void
;
@@ -3031,6 +3347,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU3-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU3-NEXT: ret void
@@ -3042,11 +3361,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: entry:
; AMDGPU3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr [[DYN]])
-; AMDGPU3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr [[DYN]])
+; AMDGPU3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU3: user_code.entry:
-; AMDGPU3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU3-NEXT: call void @__kmpc_target_deinit()
; AMDGPU3-NEXT: ret void
@@ -3058,8 +3379,11 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-LABEL: define {{[^@]+}}@__omp_outlined__4
; AMDGPU3-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; AMDGPU3-NEXT: entry:
+; AMDGPU3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @unknown_no_openmp() #[[ATTR10]]
; AMDGPU3-NEXT: call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR9]]
; AMDGPU3-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9]]
@@ -3094,6 +3418,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: entry:
; AMDGPU3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @p1() #[[ATTR11]]
; AMDGPU3-NEXT: ret void
;
@@ -3106,6 +3432,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU3-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU3-NEXT: ret void
@@ -3137,11 +3466,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: entry:
; AMDGPU3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment, ptr [[DYN]])
-; AMDGPU3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment, ptr [[DYN]])
+; AMDGPU3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU3: user_code.entry:
-; AMDGPU3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU3-NEXT: call void @__kmpc_target_deinit()
; AMDGPU3-NEXT: ret void
@@ -3153,9 +3484,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-LABEL: define {{[^@]+}}@__omp_outlined__6
; AMDGPU3-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; AMDGPU3-NEXT: entry:
+; AMDGPU3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU3-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU3-NEXT: [[CALL:%.*]] = call i32 @unknown() #[[ATTR11]]
; AMDGPU3-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__8, ptr @__omp_outlined__8_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0)
@@ -3168,6 +3502,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: entry:
; AMDGPU3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @p0() #[[ATTR11]]
; AMDGPU3-NEXT: ret void
;
@@ -3180,6 +3516,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU3-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU3-NEXT: ret void
@@ -3191,6 +3530,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: entry:
; AMDGPU3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @p1() #[[ATTR11]]
; AMDGPU3-NEXT: ret void
;
@@ -3203,6 +3544,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU3-NEXT: call void @__omp_outlined__8(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU3-NEXT: ret void
@@ -3214,11 +3558,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: entry:
; AMDGPU3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment, ptr [[DYN]])
-; AMDGPU3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment, ptr [[DYN]])
+; AMDGPU3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU3: user_code.entry:
-; AMDGPU3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @__omp_outlined__9(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU3-NEXT: call void @__kmpc_target_deinit()
; AMDGPU3-NEXT: ret void
@@ -3230,9 +3576,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-LABEL: define {{[^@]+}}@__omp_outlined__9
; AMDGPU3-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; AMDGPU3-NEXT: entry:
+; AMDGPU3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU3-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__10, ptr @__omp_outlined__10_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU3-NEXT: call void @unknown_no_openmp() #[[ATTR10]]
; AMDGPU3-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__11, ptr @__omp_outlined__11_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0)
@@ -3245,6 +3594,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: entry:
; AMDGPU3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @p0() #[[ATTR11]]
; AMDGPU3-NEXT: ret void
;
@@ -3257,6 +3608,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU3-NEXT: call void @__omp_outlined__10(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU3-NEXT: ret void
@@ -3268,6 +3622,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: entry:
; AMDGPU3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @p1() #[[ATTR11]]
; AMDGPU3-NEXT: ret void
;
@@ -3280,6 +3636,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU3-NEXT: call void @__omp_outlined__11(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU3-NEXT: ret void
@@ -3291,11 +3650,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: entry:
; AMDGPU3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment, ptr [[DYN]])
-; AMDGPU3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment, ptr [[DYN]])
+; AMDGPU3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU3: user_code.entry:
-; AMDGPU3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @__omp_outlined__12(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU3-NEXT: call void @__kmpc_target_deinit()
; AMDGPU3-NEXT: ret void
@@ -3307,9 +3668,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-LABEL: define {{[^@]+}}@__omp_outlined__12
; AMDGPU3-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; AMDGPU3-NEXT: entry:
+; AMDGPU3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; AMDGPU3-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @unknown_no_openmp() #[[ATTR10]]
; AMDGPU3-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__13, ptr @__omp_outlined__13_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; AMDGPU3-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__14, ptr @__omp_outlined__14_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0)
@@ -3322,6 +3686,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: entry:
; AMDGPU3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @p0() #[[ATTR11]]
; AMDGPU3-NEXT: ret void
;
@@ -3334,6 +3700,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU3-NEXT: call void @__omp_outlined__13(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU3-NEXT: ret void
@@ -3345,6 +3714,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: entry:
; AMDGPU3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; AMDGPU3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @p1() #[[ATTR11]]
; AMDGPU3-NEXT: ret void
;
@@ -3357,6 +3728,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU3-NEXT: call void @__omp_outlined__14(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU3-NEXT: ret void
@@ -3368,11 +3742,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: entry:
; AMDGPU3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment, ptr [[DYN]])
-; AMDGPU3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment, ptr [[DYN]])
+; AMDGPU3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; AMDGPU3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU3: user_code.entry:
-; AMDGPU3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; AMDGPU3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
;
AMDGPU3-NEXT: call void @__omp_outlined__15(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU3-NEXT: call void @__kmpc_target_deinit() ; AMDGPU3-NEXT: ret void @@ -3386,6 +3762,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU3-NEXT: entry: ; AMDGPU3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU3-NEXT: [[CALL:%.*]] = call i32 @omp_get_thread_num() #[[ATTR9]] ; AMDGPU3-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[CALL]]) #[[ATTR9]] ; AMDGPU3-NEXT: ret void @@ -3396,15 +3774,18 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU3-SAME: (i32 [[A:%.*]]) #[[ATTR6]] { ; AMDGPU3-NEXT: entry: ; AMDGPU3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU3-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -; AMDGPU3-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 -; AMDGPU3-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0 +; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5) +; AMDGPU3-NEXT: store i32 [[A]], ptr addrspace(5) [[TMP0]], align 4 +; AMDGPU3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5) +; AMDGPU3-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4 +; AMDGPU3-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], 0 ; AMDGPU3-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; AMDGPU3: if.then: ; AMDGPU3-NEXT: br label [[RETURN:%.*]] ; AMDGPU3: if.end: -; AMDGPU3-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4 -; AMDGPU3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 1 +; AMDGPU3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5) +; AMDGPU3-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4 +; AMDGPU3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 1 ; AMDGPU3-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[SUB]]) #[[ATTR9]] ; AMDGPU3-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after.internalized() #[[ATTR9]] ; AMDGPU3-NEXT: br label [[RETURN]] @@ -3439,11 +3820,13 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU3-NEXT: entry: ; AMDGPU3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment, ptr [[DYN]]) -; AMDGPU3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment, ptr [[DYN]]) +; AMDGPU3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; AMDGPU3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU3: user_code.entry: -; AMDGPU3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; AMDGPU3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; AMDGPU3-NEXT: call void 
@__omp_outlined__16(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU3-NEXT: call void @__kmpc_target_deinit() ; AMDGPU3-NEXT: ret void @@ -3457,6 +3840,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU3-NEXT: entry: ; AMDGPU3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU3-NEXT: call void @weak_callee_empty() #[[ATTR9]] ; AMDGPU3-NEXT: ret void ; @@ -3474,6 +3859,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU3-NEXT: entry: ; AMDGPU3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU3-NEXT: call void @p0() #[[ATTR11]] ; AMDGPU3-NEXT: ret void ; @@ -3486,6 +3873,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU3-NEXT: call void @__omp_outlined__17(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU3-NEXT: ret void @@ -3497,6 +3887,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU3-NEXT: entry: ; AMDGPU3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU3-NEXT: call void @p0() #[[ATTR11]] ; AMDGPU3-NEXT: ret void ; @@ -3509,6 +3901,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; AMDGPU3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; AMDGPU3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU3-NEXT: call void @__omp_outlined__18(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; AMDGPU3-NEXT: ret void @@ -3540,6 +3935,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU3-NEXT: entry: ; AMDGPU3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; AMDGPU3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; AMDGPU3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; AMDGPU3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; AMDGPU3-NEXT: call void @p0() #[[ATTR11]] ; AMDGPU3-NEXT: ret void ; 
@@ -3552,6 +3949,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; AMDGPU3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; AMDGPU3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; AMDGPU3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU3-NEXT: call void @__omp_outlined__19(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; AMDGPU3-NEXT: ret void
@@ -3563,11 +3963,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment, ptr [[DYN]])
-; NVPTX2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment, ptr [[DYN]])
+; NVPTX2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX2: user_code.entry:
-; NVPTX2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3:[0-9]+]]
+; NVPTX2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3:[0-9]+]]
+; NVPTX2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX2-NEXT: call void @__kmpc_target_deinit()
; NVPTX2-NEXT: ret void
@@ -3581,6 +3983,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9:[0-9]+]]
; NVPTX2-NEXT: call void @unknown_no_openmp() #[[ATTR10:[0-9]+]]
; NVPTX2-NEXT: ret void
@@ -3626,11 +4030,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment, ptr [[DYN]])
-; NVPTX2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment, ptr [[DYN]])
+; NVPTX2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX2: user_code.entry:
-; NVPTX2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__omp_outlined__1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX2-NEXT: call void @__kmpc_target_deinit()
; NVPTX2-NEXT: ret void
@@ -3642,9 +4048,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-LABEL: define {{[^@]+}}@__omp_outlined__1
; NVPTX2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; NVPTX2-NEXT: entry:
+; NVPTX2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX2-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @unknown_no_openmp() #[[ATTR10]]
; NVPTX2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__2, ptr @__omp_outlined__2_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX2-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9]]
@@ -3658,6 +4067,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @p0() #[[ATTR11:[0-9]+]]
; NVPTX2-NEXT: ret void
;
@@ -3670,6 +4081,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX2-NEXT: call void @__omp_outlined__2(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX2-NEXT: ret void
@@ -3681,6 +4095,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @p1() #[[ATTR11]]
; NVPTX2-NEXT: ret void
;
@@ -3693,6 +4109,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX2-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX2-NEXT: ret void
@@ -3704,11 +4123,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr [[DYN]])
-; NVPTX2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr [[DYN]])
+; NVPTX2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX2: user_code.entry:
-; NVPTX2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX2-NEXT: call void @__kmpc_target_deinit()
; NVPTX2-NEXT: ret void
@@ -3720,8 +4141,11 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-LABEL: define {{[^@]+}}@__omp_outlined__4
; NVPTX2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; NVPTX2-NEXT: entry:
+; NVPTX2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @unknown_no_openmp() #[[ATTR10]]
; NVPTX2-NEXT: call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR9]]
; NVPTX2-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9]]
@@ -3756,6 +4180,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @p1() #[[ATTR11]]
; NVPTX2-NEXT: ret void
;
@@ -3768,6 +4194,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX2-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX2-NEXT: ret void
@@ -3799,11 +4228,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment, ptr [[DYN]])
-; NVPTX2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment, ptr [[DYN]])
+; NVPTX2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX2: user_code.entry:
-; NVPTX2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX2-NEXT: call void @__kmpc_target_deinit()
; NVPTX2-NEXT: ret void
@@ -3815,9 +4246,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-LABEL: define {{[^@]+}}@__omp_outlined__6
; NVPTX2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; NVPTX2-NEXT: entry:
+; NVPTX2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX2-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX2-NEXT: [[CALL:%.*]] = call i32 @unknown() #[[ATTR11]]
; NVPTX2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__8, ptr @__omp_outlined__8_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0)
@@ -3830,6 +4264,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @p0() #[[ATTR11]]
; NVPTX2-NEXT: ret void
;
@@ -3842,6 +4278,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX2-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX2-NEXT: ret void
@@ -3853,6 +4292,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @p1() #[[ATTR11]]
; NVPTX2-NEXT: ret void
;
@@ -3865,6 +4306,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX2-NEXT: call void @__omp_outlined__8(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX2-NEXT: ret void
@@ -3876,11 +4320,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment, ptr [[DYN]])
-; NVPTX2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment, ptr [[DYN]])
+; NVPTX2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX2: user_code.entry:
-; NVPTX2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__omp_outlined__9(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX2-NEXT: call void @__kmpc_target_deinit()
; NVPTX2-NEXT: ret void
@@ -3892,9 +4338,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-LABEL: define {{[^@]+}}@__omp_outlined__9
; NVPTX2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; NVPTX2-NEXT: entry:
+; NVPTX2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX2-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__10, ptr @__omp_outlined__10_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX2-NEXT: call void @unknown_no_openmp() #[[ATTR10]]
; NVPTX2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__11, ptr @__omp_outlined__11_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0)
@@ -3907,6 +4356,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @p0() #[[ATTR11]]
; NVPTX2-NEXT: ret void
;
@@ -3919,6 +4370,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX2-NEXT: call void @__omp_outlined__10(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX2-NEXT: ret void
@@ -3930,6 +4384,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @p1() #[[ATTR11]]
; NVPTX2-NEXT: ret void
;
@@ -3942,6 +4398,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX2-NEXT: call void @__omp_outlined__11(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX2-NEXT: ret void
@@ -3953,11 +4412,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment, ptr [[DYN]])
-; NVPTX2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment, ptr [[DYN]])
+; NVPTX2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX2: user_code.entry:
-; NVPTX2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__omp_outlined__12(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX2-NEXT: call void @__kmpc_target_deinit()
; NVPTX2-NEXT: ret void
@@ -3969,9 +4430,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-LABEL: define {{[^@]+}}@__omp_outlined__12
; NVPTX2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; NVPTX2-NEXT: entry:
+; NVPTX2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX2-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @unknown_no_openmp() #[[ATTR10]]
; NVPTX2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__13, ptr @__omp_outlined__13_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__14, ptr @__omp_outlined__14_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0)
@@ -3984,6 +4448,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @p0() #[[ATTR11]]
; NVPTX2-NEXT: ret void
;
@@ -3996,6 +4462,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX2-NEXT: call void @__omp_outlined__13(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX2-NEXT: ret void
@@ -4007,6 +4476,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @p1() #[[ATTR11]]
; NVPTX2-NEXT: ret void
;
@@ -4019,6 +4490,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX2-NEXT: call void @__omp_outlined__14(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX2-NEXT: ret void
@@ -4030,11 +4504,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment, ptr [[DYN]])
-; NVPTX2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment, ptr [[DYN]])
+; NVPTX2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX2: user_code.entry:
-; NVPTX2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__omp_outlined__15(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX2-NEXT: call void @__kmpc_target_deinit()
; NVPTX2-NEXT: ret void
@@ -4048,6 +4524,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX2-NEXT: [[CALL:%.*]] = call i32 @omp_get_thread_num() #[[ATTR9]]
; NVPTX2-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[CALL]]) #[[ATTR9]]
; NVPTX2-NEXT: ret void
@@ -4058,15 +4536,18 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-SAME: (i32 [[A:%.*]]) #[[ATTR6]] {
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
-; NVPTX2-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
-; NVPTX2-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; NVPTX2-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: store i32 [[A]], ptr addrspace(5) [[TMP0]], align 4
+; NVPTX2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4
+; NVPTX2-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], 0
; NVPTX2-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
; NVPTX2: if.then:
; NVPTX2-NEXT: br label [[RETURN:%.*]]
; NVPTX2: if.end:
-; NVPTX2-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; NVPTX2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 1
+; NVPTX2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4
+; NVPTX2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 1
; NVPTX2-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[SUB]]) #[[ATTR9]]
; NVPTX2-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after.internalized() #[[ATTR9]]
; NVPTX2-NEXT: br label [[RETURN]]
@@ -4101,11 +4582,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment, ptr [[DYN]])
-; NVPTX2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment, ptr [[DYN]])
+; NVPTX2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX2: user_code.entry:
-; NVPTX2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__omp_outlined__16(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX2-NEXT: call void @__kmpc_target_deinit()
; NVPTX2-NEXT: ret void
@@ -4119,6 +4602,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @weak_callee_empty() #[[ATTR9]]
; NVPTX2-NEXT: ret void
;
@@ -4136,6 +4621,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @p0() #[[ATTR11]]
; NVPTX2-NEXT: ret void
;
@@ -4148,6 +4635,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX2-NEXT: call void @__omp_outlined__17(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX2-NEXT: ret void
@@ -4159,6 +4649,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @p0() #[[ATTR11]]
; NVPTX2-NEXT: ret void
;
@@ -4171,6 +4663,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX2-NEXT: call void @__omp_outlined__18(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX2-NEXT: ret void
@@ -4202,6 +4697,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: entry:
; NVPTX2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @p0() #[[ATTR11]]
; NVPTX2-NEXT: ret void
;
@@ -4214,6 +4711,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX2-NEXT: call void @__omp_outlined__19(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX2-NEXT: ret void
@@ -4225,11 +4725,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX3-NEXT: entry:
; NVPTX3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment, ptr [[DYN]])
-; NVPTX3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment, ptr [[DYN]])
+; NVPTX3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX3: user_code.entry:
-; NVPTX3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3:[0-9]+]]
+; NVPTX3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3:[0-9]+]]
+; NVPTX3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX3-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX3-NEXT: call void @__kmpc_target_deinit()
; NVPTX3-NEXT: ret void
@@ -4243,6 +4745,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX3-NEXT: entry:
; NVPTX3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX3-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9:[0-9]+]]
; NVPTX3-NEXT: call void @unknown_no_openmp() #[[ATTR10:[0-9]+]]
; NVPTX3-NEXT: ret void
@@ -4288,11 +4792,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX3-NEXT: entry:
; NVPTX3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment, ptr [[DYN]])
-; NVPTX3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment, ptr [[DYN]])
+; NVPTX3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX3: user_code.entry:
-; NVPTX3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX3-NEXT: call void @__omp_outlined__1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX3-NEXT: call void @__kmpc_target_deinit()
; NVPTX3-NEXT: ret void
@@ -4304,9 +4810,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX3-LABEL: define {{[^@]+}}@__omp_outlined__1
; NVPTX3-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; NVPTX3-NEXT: entry:
+; NVPTX3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX3-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX3-NEXT: call void @unknown_no_openmp() #[[ATTR10]]
; NVPTX3-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__2, ptr @__omp_outlined__2_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX3-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9]]
@@ -4320,6 +4829,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX3-NEXT: entry:
; NVPTX3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX3-NEXT: call void @p0() #[[ATTR11:[0-9]+]]
; NVPTX3-NEXT: ret void
;
@@ -4332,6 +4843,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX3-NEXT: call void @__omp_outlined__2(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX3-NEXT: ret void
@@ -4343,6 +4857,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX3-NEXT: entry:
; NVPTX3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX3-NEXT: call void @p1() #[[ATTR11]]
; NVPTX3-NEXT: ret void
;
@@ -4355,6 +4871,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX3-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX3-NEXT: ret void
@@ -4366,11 +4885,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX3-NEXT: entry:
; NVPTX3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr [[DYN]])
-; NVPTX3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr [[DYN]])
+; NVPTX3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX3: user_code.entry:
-; NVPTX3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX3-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX3-NEXT: call void @__kmpc_target_deinit()
; NVPTX3-NEXT: ret void
@@ -4382,8 +4903,11 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX3-LABEL: define {{[^@]+}}@__omp_outlined__4
; NVPTX3-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; NVPTX3-NEXT: entry:
+; NVPTX3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
+; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX3-NEXT: call void @unknown_no_openmp() #[[ATTR10]]
; NVPTX3-NEXT: call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR9]]
; NVPTX3-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR9]]
@@ -4418,6 +4942,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX3-NEXT: entry:
; NVPTX3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX3-NEXT: call void @p1() #[[ATTR11]]
; NVPTX3-NEXT: ret void
;
@@ -4430,6 +4956,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX3-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX3-NEXT: ret void
@@ -4461,11 +4990,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX3-NEXT: entry:
; NVPTX3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment, ptr [[DYN]])
-; NVPTX3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment, ptr [[DYN]])
+; NVPTX3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
; NVPTX3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX3: user_code.entry:
-; NVPTX3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]]
+; NVPTX3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
; NVPTX3-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX3-NEXT: call void @__kmpc_target_deinit()
; NVPTX3-NEXT: ret void
@@ -4477,9 +5008,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX3-LABEL: define {{[^@]+}}@__omp_outlined__6
; NVPTX3-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; NVPTX3-NEXT: entry:
+; NVPTX3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
; NVPTX3-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8
+; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX3-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
; NVPTX3-NEXT: [[CALL:%.*]] = call i32 @unknown() #[[ATTR11]]
; NVPTX3-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__8, ptr @__omp_outlined__8_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0)
@@ -4492,6 +5026,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX3-NEXT: entry:
; NVPTX3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX3-NEXT: call void @p0() #[[ATTR11]]
; NVPTX3-NEXT: ret void
;
@@ -4504,6 +5040,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX3-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX3-NEXT: ret void
@@ -4515,6 +5054,8 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX3-NEXT: entry:
; NVPTX3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; NVPTX3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; NVPTX3-NEXT: call void @p1() #[[ATTR11]]
; NVPTX3-NEXT: ret void
;
@@ -4527,6 +5068,9 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; NVPTX3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
; NVPTX3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX3-NEXT: call void @__omp_outlined__8(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; NVPTX3-NEXT: ret void
@@ -4538,11 +5082,13 @@ attributes #9 = { convergent nounwind readonly willreturn }
; NVPTX3-NEXT: entry:
; NVPTX3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment, ptr [[DYN]])
-; NVPTX3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq
i32 [[TMP0]], -1 +; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment, ptr [[DYN]]) +; NVPTX3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; NVPTX3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX3: user_code.entry: -; NVPTX3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; NVPTX3-NEXT: call void @__omp_outlined__9(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX3-NEXT: call void @__kmpc_target_deinit() ; NVPTX3-NEXT: ret void @@ -4554,9 +5100,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3-LABEL: define {{[^@]+}}@__omp_outlined__9 ; NVPTX3-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX3-NEXT: entry: +; NVPTX3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 ; NVPTX3-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8 +; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX3-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__10, ptr @__omp_outlined__10_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX3-NEXT: call void @unknown_no_openmp() #[[ATTR10]] ; NVPTX3-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__11, ptr @__omp_outlined__11_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0) @@ -4569,6 +5118,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3-NEXT: entry: ; NVPTX3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX3-NEXT: call void @p0() #[[ATTR11]] ; NVPTX3-NEXT: ret void ; @@ -4581,6 +5132,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX3-NEXT: call void @__omp_outlined__10(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX3-NEXT: ret void @@ -4592,6 +5146,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3-NEXT: entry: ; NVPTX3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] 
to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX3-NEXT: call void @p1() #[[ATTR11]] ; NVPTX3-NEXT: ret void ; @@ -4604,6 +5160,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX3-NEXT: call void @__omp_outlined__11(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX3-NEXT: ret void @@ -4615,11 +5174,13 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3-NEXT: entry: ; NVPTX3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment, ptr [[DYN]]) -; NVPTX3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment, ptr [[DYN]]) +; NVPTX3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; NVPTX3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX3: user_code.entry: -; NVPTX3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; NVPTX3-NEXT: call void @__omp_outlined__12(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX3-NEXT: call void @__kmpc_target_deinit() ; NVPTX3-NEXT: ret void @@ -4631,9 +5192,12 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3-LABEL: define {{[^@]+}}@__omp_outlined__12 ; NVPTX3-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX3-NEXT: entry: +; NVPTX3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 ; NVPTX3-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8 +; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX3-NEXT: call void @unknown_no_openmp() #[[ATTR10]] ; NVPTX3-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__13, ptr @__omp_outlined__13_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX3-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 undef, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__14, ptr @__omp_outlined__14_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0) @@ -4646,6 +5210,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3-NEXT: entry: ; NVPTX3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX3-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX3-NEXT: call void @p0() #[[ATTR11]] ; NVPTX3-NEXT: ret void ; @@ -4658,6 +5224,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX3-NEXT: call void @__omp_outlined__13(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX3-NEXT: ret void @@ -4669,6 +5238,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3-NEXT: entry: ; NVPTX3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX3-NEXT: call void @p1() #[[ATTR11]] ; NVPTX3-NEXT: ret void ; @@ -4681,6 +5252,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX3-NEXT: call void @__omp_outlined__14(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX3-NEXT: ret void @@ -4692,11 +5266,13 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3-NEXT: entry: ; NVPTX3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment, ptr [[DYN]]) -; NVPTX3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment, ptr [[DYN]]) +; NVPTX3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; NVPTX3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX3: user_code.entry: -; NVPTX3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; NVPTX3-NEXT: call void @__omp_outlined__15(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX3-NEXT: call void @__kmpc_target_deinit() ; 
NVPTX3-NEXT: ret void @@ -4710,6 +5286,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3-NEXT: entry: ; NVPTX3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX3-NEXT: [[CALL:%.*]] = call i32 @omp_get_thread_num() #[[ATTR9]] ; NVPTX3-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[CALL]]) #[[ATTR9]] ; NVPTX3-NEXT: ret void @@ -4720,15 +5298,18 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3-SAME: (i32 [[A:%.*]]) #[[ATTR6]] { ; NVPTX3-NEXT: entry: ; NVPTX3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -; NVPTX3-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -; NVPTX3-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 -; NVPTX3-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0 +; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: store i32 [[A]], ptr addrspace(5) [[TMP0]], align 4 +; NVPTX3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4 +; NVPTX3-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], 0 ; NVPTX3-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; NVPTX3: if.then: ; NVPTX3-NEXT: br label [[RETURN:%.*]] ; NVPTX3: if.end: -; NVPTX3-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4 -; NVPTX3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 1 +; NVPTX3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4 +; NVPTX3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 1 ; NVPTX3-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[SUB]]) #[[ATTR9]] ; NVPTX3-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after.internalized() #[[ATTR9]] ; NVPTX3-NEXT: br label [[RETURN]] @@ -4763,11 +5344,13 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3-NEXT: entry: ; NVPTX3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment, ptr [[DYN]]) -; NVPTX3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment, ptr [[DYN]]) +; NVPTX3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 ; NVPTX3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX3: user_code.entry: -; NVPTX3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR3]] +; NVPTX3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; NVPTX3-NEXT: call void @__omp_outlined__16(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX3-NEXT: call void @__kmpc_target_deinit() ; NVPTX3-NEXT: ret void @@ -4781,6 +5364,8 @@ attributes #9 = { convergent 
nounwind readonly willreturn } ; NVPTX3-NEXT: entry: ; NVPTX3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX3-NEXT: call void @weak_callee_empty() #[[ATTR9]] ; NVPTX3-NEXT: ret void ; @@ -4798,6 +5383,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3-NEXT: entry: ; NVPTX3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX3-NEXT: call void @p0() #[[ATTR11]] ; NVPTX3-NEXT: ret void ; @@ -4810,6 +5397,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX3-NEXT: call void @__omp_outlined__17(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX3-NEXT: ret void @@ -4821,6 +5411,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3-NEXT: entry: ; NVPTX3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX3-NEXT: call void @p0() #[[ATTR11]] ; NVPTX3-NEXT: ret void ; @@ -4833,6 +5425,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX3-NEXT: call void @__omp_outlined__18(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX3-NEXT: ret void @@ -4864,6 +5459,8 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3-NEXT: entry: ; NVPTX3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 ; NVPTX3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +; NVPTX3-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5) ; NVPTX3-NEXT: call void @p0() #[[ATTR11]] ; NVPTX3-NEXT: ret void ; @@ -4876,6 +5473,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; 
NVPTX3-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5) +; NVPTX3-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) ; NVPTX3-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX3-NEXT: call void @__omp_outlined__19(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] ; NVPTX3-NEXT: ret void diff --git a/llvm/test/Transforms/OpenMP/nested_parallelism.ll b/llvm/test/Transforms/OpenMP/nested_parallelism.ll index 1679a27fdae8b..834c623d27695 100644 --- a/llvm/test/Transforms/OpenMP/nested_parallelism.ll +++ b/llvm/test/Transforms/OpenMP/nested_parallelism.ll @@ -64,7 +64,8 @@ define weak_odr protected ptx_kernel void @__omp_offloading_10302_bd7e0_main_l13 ; CHECK-NEXT: br label [[_Z3FOOI_INTERNALIZED_EXIT]] ; CHECK: _Z3fooi.internalized.exit: ; CHECK-NEXT: tail call void @__kmpc_barrier_simple_spmd(ptr nonnull @[[GLOB1]], i32 [[TMP2]]) #[[ATTR2]] -; CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) @i_shared to ptr), ptr [[CAPTURED_VARS_ADDRS_I]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS_I]] to ptr addrspace(5) +; CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) @i_shared to ptr), ptr addrspace(5) [[TMP4]], align 8 ; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__, ptr nonnull @__omp_outlined___wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS_I]], i64 1) ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I]]) ; CHECK-NEXT: call void @__kmpc_target_deinit() @@ -109,7 +110,8 @@ define hidden void @_Z3fooi(i32 noundef %i1) local_unnamed_addr #1 { ; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR2]] ; CHECK-NEXT: [[I:%.*]] = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) #[[ATTR2]] ; CHECK-NEXT: store i32 [[I1:%.*]], ptr [[I]], align 16 -; CHECK-NEXT: store ptr [[I]], ptr [[CAPTURED_VARS_ADDRS]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS]] to ptr addrspace(5) +; CHECK-NEXT: store ptr [[I]], ptr addrspace(5) [[TMP1]], align 8 ; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__, ptr nonnull @__omp_outlined___wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS]], i64 1) ; CHECK-NEXT: call void @__kmpc_free_shared(ptr [[I]], i64 4) #[[ATTR2]] ; CHECK-NEXT: ret void @@ -141,7 +143,8 @@ define weak_odr protected ptx_kernel void @__omp_offloading_10302_bd7e0_main_l16 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I]]) ; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR2]] ; CHECK-NEXT: store i32 [[I_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr addrspace(3) @i.i_shared, align 16 -; CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) @i.i_shared to ptr), ptr [[CAPTURED_VARS_ADDRS_I]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS_I]] to ptr addrspace(5) +; CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) @i.i_shared to ptr), ptr addrspace(5) [[TMP2]], align 8 ; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS_I]], i64 1) ; CHECK-NEXT: call 
void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I]]) ; CHECK-NEXT: call void @__kmpc_target_deinit() @@ -175,7 +178,8 @@ define hidden void @_Z4foo1i(i32 noundef %i1) local_unnamed_addr #1 { ; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR2]] ; CHECK-NEXT: [[I:%.*]] = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) #[[ATTR2]] ; CHECK-NEXT: store i32 [[I1:%.*]], ptr [[I]], align 16 -; CHECK-NEXT: store ptr [[I]], ptr [[CAPTURED_VARS_ADDRS]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS]] to ptr addrspace(5) +; CHECK-NEXT: store ptr [[I]], ptr addrspace(5) [[TMP1]], align 8 ; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS]], i64 1) ; CHECK-NEXT: call void @__kmpc_free_shared(ptr [[I]], i64 4) #[[ATTR2]] ; CHECK-NEXT: ret void @@ -202,7 +206,8 @@ define internal void @__omp_outlined__(ptr noalias nocapture readnone %.global_t ; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR2]] ; CHECK-NEXT: [[I_I:%.*]] = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) #[[ATTR2]] ; CHECK-NEXT: store i32 [[TMP0]], ptr [[I_I]], align 16 -; CHECK-NEXT: store ptr [[I_I]], ptr [[CAPTURED_VARS_ADDRS_I]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS_I]] to ptr addrspace(5) +; CHECK-NEXT: store ptr [[I_I]], ptr addrspace(5) [[TMP2]], align 8 ; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS_I]], i64 1) ; CHECK-NEXT: call void @__kmpc_free_shared(ptr [[I_I]], i64 4) #[[ATTR2]] ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I]]) @@ -228,15 +233,17 @@ define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) #5 { ; CHECK-NEXT: [[CAPTURED_VARS_ADDRS_I_I:%.*]] = alloca [1 x ptr], align 8 ; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; CHECK-NEXT: call void @__kmpc_get_shared_variables(ptr nonnull [[GLOBAL_ARGS]]) -; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[GLOBAL_ARGS]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[TMP5]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I_I]]) -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR2]] ; CHECK-NEXT: [[I_I_I:%.*]] = call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) #[[ATTR2]] ; CHECK-NEXT: store i32 [[TMP4]], ptr [[I_I_I]], align 16 -; CHECK-NEXT: store ptr [[I_I_I]], ptr [[CAPTURED_VARS_ADDRS_I_I]], align 8 -; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS_I_I]], i64 1) +; CHECK-NEXT: [[TMP7:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS_I_I]] to ptr 
addrspace(5) +; CHECK-NEXT: store ptr [[I_I_I]], ptr addrspace(5) [[TMP7]], align 8 +; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS_I_I]], i64 1) ; CHECK-NEXT: call void @__kmpc_free_shared(ptr [[I_I_I]], i64 4) #[[ATTR2]] ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I_I]]) ; CHECK-NEXT: ret void @@ -287,7 +294,8 @@ define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #5 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; CHECK-NEXT: call void @__kmpc_get_shared_variables(ptr nonnull [[GLOBAL_ARGS]]) -; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[GLOBAL_ARGS]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[TMP5]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[INC_I:%.*]] = add nsw i32 [[TMP4]], 1 diff --git a/llvm/test/Transforms/OpenMP/remove_globalization.ll b/llvm/test/Transforms/OpenMP/remove_globalization.ll index 29f2030c4d42b..17d17fe55d42f 100644 --- a/llvm/test/Transforms/OpenMP/remove_globalization.ll +++ b/llvm/test/Transforms/OpenMP/remove_globalization.ll @@ -163,17 +163,22 @@ define internal void @convert_and_move_alloca() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[DOTH2S:%.*]] = alloca i8, i64 4, align 4 ; CHECK-NEXT: [[IV_PTR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[UB_PTR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[UB_PTR]] to ptr addrspace(5) ; CHECK-NEXT: br label [[INITLOOP:%.*]] ; CHECK: initloop: -; CHECK-NEXT: store i32 0, ptr [[IV_PTR]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[IV_PTR]] to ptr addrspace(5) +; CHECK-NEXT: store i32 0, ptr addrspace(5) [[TMP1]], align 4 ; CHECK-NEXT: br label [[LOOPBODY:%.*]] ; CHECK: loopbody: -; CHECK-NEXT: [[IV:%.*]] = load i32, ptr [[IV_PTR]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[IV]], 10 -; CHECK-NEXT: br i1 [[TMP0]], label [[EXIT:%.*]], label [[LOOPINC:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[IV_PTR]] to ptr addrspace(5) +; CHECK-NEXT: [[IV:%.*]] = load i32, ptr addrspace(5) [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[IV]], 10 +; CHECK-NEXT: br i1 [[TMP3]], label [[EXIT:%.*]], label [[LOOPINC:%.*]] ; CHECK: loopinc: ; CHECK-NEXT: [[INC:%.*]] = add i32 [[IV]], 1 -; CHECK-NEXT: store i32 [[INC]], ptr [[IV_PTR]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[IV_PTR]] to ptr addrspace(5) +; CHECK-NEXT: store i32 [[INC]], ptr addrspace(5) [[TMP4]], align 4 ; CHECK-NEXT: br label [[LOOPBODY]] ; CHECK: exit: ; CHECK-NEXT: ret void @@ -183,17 +188,22 @@ define internal void @convert_and_move_alloca() { ; CHECK-DISABLED-NEXT: entry: ; CHECK-DISABLED-NEXT: [[DOTH2S:%.*]] = alloca i8, i64 4, align 4 ; CHECK-DISABLED-NEXT: [[IV_PTR:%.*]] = alloca i32, align 4 +; CHECK-DISABLED-NEXT: [[UB_PTR:%.*]] = alloca i32, align 4 +; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[UB_PTR]] to ptr addrspace(5) ; CHECK-DISABLED-NEXT: br label [[INITLOOP:%.*]] ; CHECK-DISABLED: initloop: -; CHECK-DISABLED-NEXT: store i32 0, ptr [[IV_PTR]], align 4 +; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[IV_PTR]] to ptr addrspace(5) +; CHECK-DISABLED-NEXT: store i32 0, ptr addrspace(5) 
[[TMP1]], align 4 ; CHECK-DISABLED-NEXT: br label [[LOOPBODY:%.*]] ; CHECK-DISABLED: loopbody: -; CHECK-DISABLED-NEXT: [[IV:%.*]] = load i32, ptr [[IV_PTR]], align 4 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = icmp eq i32 [[IV]], 10 -; CHECK-DISABLED-NEXT: br i1 [[TMP0]], label [[EXIT:%.*]], label [[LOOPINC:%.*]] +; CHECK-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[IV_PTR]] to ptr addrspace(5) +; CHECK-DISABLED-NEXT: [[IV:%.*]] = load i32, ptr addrspace(5) [[TMP2]], align 4 +; CHECK-DISABLED-NEXT: [[TMP3:%.*]] = icmp eq i32 [[IV]], 10 +; CHECK-DISABLED-NEXT: br i1 [[TMP3]], label [[EXIT:%.*]], label [[LOOPINC:%.*]] ; CHECK-DISABLED: loopinc: ; CHECK-DISABLED-NEXT: [[INC:%.*]] = add i32 [[IV]], 1 -; CHECK-DISABLED-NEXT: store i32 [[INC]], ptr [[IV_PTR]], align 4 +; CHECK-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[IV_PTR]] to ptr addrspace(5) +; CHECK-DISABLED-NEXT: store i32 [[INC]], ptr addrspace(5) [[TMP4]], align 4 ; CHECK-DISABLED-NEXT: br label [[LOOPBODY]] ; CHECK-DISABLED: exit: ; CHECK-DISABLED-NEXT: ret void diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll index 1a629ecfee06d..c364177f01e53 100644 --- a/llvm/test/Transforms/OpenMP/spmdization.ll +++ b/llvm/test/Transforms/OpenMP/spmdization.ll @@ -234,7 +234,9 @@ define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; AMDGPU-NEXT: ret void ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12:![0-9]+]] ; AMDGPU-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: br label [[COMMON_RET]] @@ -251,7 +253,9 @@ define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; NVPTX-NEXT: ret void ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12:![0-9]+]] ; NVPTX-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: br label [[COMMON_RET]] @@ -302,7 +306,9 @@ define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: user_code.entry: ; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] +; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], 
align 4, !tbaa [[TBAA12:![0-9]+]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -319,7 +325,9 @@ define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: user_code.entry: ; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] +; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12:![0-9]+]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]] @@ -369,7 +377,9 @@ define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: user_code.entry: ; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] +; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12:![0-9]+]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -386,7 +396,9 @@ define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: user_code.entry: ; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] +; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12:![0-9]+]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] @@ -425,8 +437,9 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %. 
; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; AMDGPU-NEXT: ret void ; AMDGPU: for.body: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; @@ -443,8 +456,9 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %. ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; NVPTX-NEXT: ret void ; NVPTX: for.body: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; @@ -461,8 +475,9 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %. ; AMDGPU-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: for.body: -; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) +; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; @@ -479,8 +494,9 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %. 
; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: for.body: -; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) +; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; @@ -497,8 +513,9 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %. ; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: for.body: -; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) +; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; @@ -515,8 +532,9 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %. 
; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: for.body: -; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) +; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; @@ -591,6 +609,8 @@ define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-NEXT: ret void @@ -601,6 +621,8 @@ define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-NEXT: ret void @@ -611,6 +633,8 @@ define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: ret void @@ -621,6 +645,8 @@ define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__1(ptr 
[[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: ret void @@ -631,6 +657,8 @@ define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; NVPTX-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: ret void @@ -641,6 +669,8 @@ define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: ret void @@ -670,7 +700,9 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; AMDGPU-NEXT: ret void ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: br label [[COMMON_RET]] @@ -687,7 +719,9 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; NVPTX-NEXT: ret void ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: br label [[COMMON_RET]] @@ -738,7 +772,9 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: user_code.entry: ; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: 
[[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -755,7 +791,9 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: user_code.entry: ; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]] @@ -805,7 +843,9 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: user_code.entry: ; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -822,7 +862,9 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: user_code.entry: ; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] @@ -864,8 +906,9 @@ define internal void @__omp_outlined__2(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-NEXT: ret void ; AMDGPU: for.body: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) +; AMDGPU-NEXT: 
[[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; @@ -884,8 +927,9 @@ define internal void @__omp_outlined__2(ptr noalias %.global_tid., ptr noalias % ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-NEXT: ret void ; NVPTX: for.body: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; @@ -905,8 +949,9 @@ define internal void @__omp_outlined__2(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: for.body: -; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) +; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; @@ -926,8 +971,9 @@ define internal void @__omp_outlined__2(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: for.body: -; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) +; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], 
!llvm.loop [[LOOP19:![0-9]+]] ; @@ -946,8 +992,9 @@ define internal void @__omp_outlined__2(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: for.body: -; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) +; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; @@ -966,8 +1013,9 @@ define internal void @__omp_outlined__2(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: for.body: -; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) +; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; @@ -1044,6 +1092,8 @@ define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-NEXT: ret void @@ -1054,6 +1104,8 @@ define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-NEXT: ret void @@ -1064,6 +1116,8 @@ define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-DISABLED1-NEXT: [[DOTADDR1:%.*]] 
= alloca i32, align 4 ; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: ret void @@ -1074,6 +1128,8 @@ define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: ret void @@ -1084,6 +1140,8 @@ define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; NVPTX-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: ret void @@ -1094,6 +1152,8 @@ define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: ret void @@ -1124,7 +1184,9 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; AMDGPU-NEXT: ret void ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: br label [[COMMON_RET]] @@ -1141,7 +1203,9 @@ define weak ptx_kernel void 
@__omp_offloading_fd02_2044372e_sequential_loop_to_s ; NVPTX-NEXT: ret void ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: br label [[COMMON_RET]] @@ -1192,7 +1256,9 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: user_code.entry: ; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -1209,7 +1275,9 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: user_code.entry: ; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]] @@ -1259,7 +1327,9 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: user_code.entry: ; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -1276,7 +1346,9 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; NVPTX-DISABLED2-NEXT: ret void ; 
NVPTX-DISABLED2: user_code.entry: ; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] @@ -1315,9 +1387,11 @@ define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-NEXT: ret void ; AMDGPU: for.body: -; AMDGPU-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) +; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS]] to ptr addrspace(5) +; AMDGPU-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr addrspace(5) [[TMP0]], align 8, !tbaa [[TBAA20:![0-9]+]] +; AMDGPU-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4, !tbaa [[TBAA12]] +; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; @@ -1334,9 +1408,11 @@ define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias % ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-NEXT: ret void ; NVPTX: for.body: -; NVPTX-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) +; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS]] to ptr addrspace(5) +; NVPTX-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr addrspace(5) [[TMP0]], align 8, !tbaa [[TBAA20:![0-9]+]] +; NVPTX-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4, !tbaa [[TBAA12]] +; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; @@ -1353,9 +1429,11 @@ define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-DISABLED1-NEXT: 
call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: for.body: -; AMDGPU-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] -; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) +; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr addrspace(5) [[TMP0]], align 8, !tbaa [[TBAA20:![0-9]+]] +; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; AMDGPU-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; @@ -1372,9 +1450,11 @@ define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: for.body: -; AMDGPU-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] -; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) +; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr addrspace(5) [[TMP0]], align 8, !tbaa [[TBAA20:![0-9]+]] +; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; AMDGPU-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; @@ -1391,9 +1471,11 @@ define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: for.body: -; NVPTX-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] -; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) +; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = 
addrspacecast ptr [[CAPTURED_VARS_ADDRS]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr addrspace(5) [[TMP0]], align 8, !tbaa [[TBAA20:![0-9]+]] +; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; NVPTX-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; @@ -1410,9 +1492,11 @@ define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: for.body: -; NVPTX-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] -; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) +; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr addrspace(5) [[TMP0]], align 8, !tbaa [[TBAA20:![0-9]+]] +; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; NVPTX-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; @@ -1511,10 +1595,13 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; AMDGPU-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] -; AMDGPU-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] +; AMDGPU-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[GLOBAL_ARGS]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[TMP4]], align 8 +; AMDGPU-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA20]] +; AMDGPU-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP6]]) #[[ATTR4]] ; AMDGPU-NEXT: ret void ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper @@ -1523,10 +1610,13 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-NEXT: [[DOTADDR1:%.*]] = 
alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; NVPTX-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] -; NVPTX-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] +; NVPTX-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[GLOBAL_ARGS]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[TMP4]], align 8 +; NVPTX-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA20]] +; NVPTX-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP6]]) #[[ATTR4]] ; NVPTX-NEXT: ret void ; ; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper @@ -1535,10 +1625,13 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] -; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] +; AMDGPU-DISABLED1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[GLOBAL_ARGS]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[TMP4]], align 8 +; AMDGPU-DISABLED1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA20]] +; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP6]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: ret void ; ; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper @@ -1547,10 +1640,13 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] -; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] +; AMDGPU-DISABLED2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[GLOBAL_ARGS]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[TMP4]], align 8 +; AMDGPU-DISABLED2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, 
!tbaa [[TBAA20]] +; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP6]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: ret void ; ; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper @@ -1559,10 +1655,13 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; NVPTX-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] -; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] +; NVPTX-DISABLED1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[GLOBAL_ARGS]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[TMP4]], align 8 +; NVPTX-DISABLED1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA20]] +; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP6]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: ret void ; ; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper @@ -1571,10 +1670,13 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] -; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] +; NVPTX-DISABLED2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[GLOBAL_ARGS]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[TMP4]], align 8 +; NVPTX-DISABLED2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA20]] +; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP6]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: ret void ; entry: @@ -1604,7 +1706,9 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; AMDGPU-NEXT: ret void ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr 
[[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: br label [[COMMON_RET]] @@ -1621,7 +1725,9 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; NVPTX-NEXT: ret void ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: br label [[COMMON_RET]] @@ -1672,7 +1778,9 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: user_code.entry: ; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -1689,7 +1797,9 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: user_code.entry: ; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]] @@ -1739,7 +1849,9 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: user_code.entry: ; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED1-NEXT: br label 
[[COMMON_RET]] @@ -1756,7 +1868,9 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_s ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: user_code.entry: ; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] @@ -1809,9 +1923,11 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-NEXT: ret void ; AMDGPU: for.body: -; AMDGPU-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] -; AMDGPU-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS]] to ptr addrspace(5) +; AMDGPU-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr addrspace(5) [[TMP2]], align 8, !tbaa [[TBAA20]] +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]] +; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; @@ -1842,9 +1958,11 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-NEXT: ret void ; NVPTX: for.body: -; NVPTX-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] -; NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS]] to ptr addrspace(5) +; NVPTX-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr addrspace(5) [[TMP2]], align 8, !tbaa [[TBAA20]] +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]] +; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; @@ 
-1862,9 +1980,11 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: for.body: -; AMDGPU-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] -; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) +; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr addrspace(5) [[TMP0]], align 8, !tbaa [[TBAA20]] +; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; AMDGPU-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; @@ -1882,9 +2002,11 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: for.body: -; AMDGPU-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] -; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) +; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr addrspace(5) [[TMP0]], align 8, !tbaa [[TBAA20]] +; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4, !tbaa [[TBAA12]] +; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; AMDGPU-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; @@ -1902,9 +2024,11 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: for.body: -; NVPTX-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] -; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr 
@__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) +; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr addrspace(5) [[TMP0]], align 8, !tbaa [[TBAA20]] +; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; NVPTX-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; @@ -1922,9 +2046,11 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: for.body: -; NVPTX-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] -; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] -; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) +; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr addrspace(5) [[TMP0]], align 8, !tbaa [[TBAA20]] +; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID_]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4, !tbaa [[TBAA12]] +; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; NVPTX-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; @@ -2024,10 +2150,13 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; AMDGPU-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] -; AMDGPU-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] +; AMDGPU-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[GLOBAL_ARGS]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[TMP4]], align 8 +; AMDGPU-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA20]] +; AMDGPU-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP6]]) #[[ATTR4]] ; AMDGPU-NEXT: ret void ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper @@ -2036,10 +2165,13 @@ define internal void 
@__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; NVPTX-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] -; NVPTX-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] +; NVPTX-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[GLOBAL_ARGS]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[TMP4]], align 8 +; NVPTX-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA20]] +; NVPTX-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP6]]) #[[ATTR4]] ; NVPTX-NEXT: ret void ; ; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper @@ -2048,10 +2180,13 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] -; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] +; AMDGPU-DISABLED1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[GLOBAL_ARGS]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[TMP4]], align 8 +; AMDGPU-DISABLED1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA20]] +; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP6]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: ret void ; ; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper @@ -2060,10 +2195,13 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] -; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] +; AMDGPU-DISABLED2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[GLOBAL_ARGS]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) 
[[TMP4]], align 8 +; AMDGPU-DISABLED2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA20]] +; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP6]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: ret void ; ; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper @@ -2072,10 +2210,13 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; NVPTX-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] -; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] +; NVPTX-DISABLED1-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[GLOBAL_ARGS]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[TMP4]], align 8 +; NVPTX-DISABLED1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA20]] +; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP6]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: ret void ; ; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper @@ -2084,10 +2225,13 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] -; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] +; NVPTX-DISABLED2-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[GLOBAL_ARGS]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[TMP4]], align 8 +; NVPTX-DISABLED2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA20]] +; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP6]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: ret void ; entry: @@ -2147,6 +2291,8 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_targe ; AMDGPU-NEXT: ret void ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: 
br label [[COMMON_RET]] @@ -2192,6 +2338,8 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_targe ; NVPTX-NEXT: ret void ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: br label [[COMMON_RET]] @@ -2238,6 +2386,8 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_targe ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: user_code.entry: ; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] +; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -2254,6 +2404,8 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_targe ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: user_code.entry: ; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] +; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]] @@ -2299,6 +2451,8 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_targe ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: user_code.entry: ; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] +; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -2315,6 +2469,8 @@ define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_targe ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: user_code.entry: ; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] +; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5) ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] @@ -2816,6 +2972,8 @@ define internal void @__omp_outlined__9_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; 
AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-NEXT: ret void @@ -2826,6 +2984,8 @@ define internal void @__omp_outlined__9_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-NEXT: ret void @@ -2836,6 +2996,8 @@ define internal void @__omp_outlined__9_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: ret void @@ -2846,6 +3008,8 @@ define internal void @__omp_outlined__9_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: ret void @@ -2856,6 +3020,8 @@ define internal void @__omp_outlined__9_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-DISABLED1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; NVPTX-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: ret void @@ -2866,6 +3032,8 @@ define internal void @__omp_outlined__9_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-DISABLED2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = 
addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5) +; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5) ; NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: ret void diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll index 2f1aadc073142..10804a5e270f1 100644 --- a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll @@ -72,17 +72,18 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr % ; CHECK: user_code.entry: ; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[N]], 42 ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[C]], ptr [[AL32]], ptr addrspacecast (ptr addrspace(5) @LocGlob to ptr) -; CHECK-NEXT: store ptr [[SELECT]], ptr [[LOC]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]] +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[LOC]] to ptr addrspace(5) +; CHECK-NEXT: store ptr [[SELECT]], ptr addrspace(5) [[TMP1]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]] ; CHECK-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 1 ; CHECK-NEXT: [[SEXT:%.*]] = shl i64 [[N]], 32 ; CHECK-NEXT: [[IDXPROM_I:%.*]] = ashr exact i64 [[SEXT]], 32 ; CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM_I]] ; CHECK-NEXT: br label [[REGION_CHECK_TID:%.*]] ; CHECK: region.check.tid: -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[TMP3]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] ; CHECK: region.guarded: ; CHECK-NEXT: store i32 0, ptr [[X]], align 4, !noalias [[META7:![0-9]+]] ; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX1_I]], align 4, !noalias [[META7]] @@ -91,7 +92,7 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr % ; CHECK: region.guarded.end: ; CHECK-NEXT: br label [[REGION_BARRIER]] ; CHECK: region.barrier: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP2]]) +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP3]]) ; CHECK-NEXT: br label [[REGION_EXIT:%.*]] ; CHECK: region.exit: ; CHECK-NEXT: call void @usei8ptr(ptr captures(none) [[HEAP2STACK_H2S]]) #[[ATTR9:[0-9]+]] @@ -107,18 +108,19 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr % ; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM4_I]] ; CHECK-NEXT: br label [[REGION_CHECK_TID5:%.*]] ; CHECK: region.check.tid5: -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[REGION_GUARDED4:%.*]], label [[REGION_BARRIER2:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[TMP6]], label [[REGION_GUARDED4:%.*]], label 
; CHECK: region.guarded4:
; CHECK-NEXT: store i32 [[SUB3_I]], ptr [[ARRAYIDX5_I]], align 4, !noalias [[META7]]
; CHECK-NEXT: br label [[REGION_GUARDED_END1:%.*]]
; CHECK: region.guarded.end1:
; CHECK-NEXT: br label [[REGION_BARRIER2]]
; CHECK: region.barrier2:
-; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP4]])
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP5]])
; CHECK-NEXT: br label [[REGION_EXIT3]]
; CHECK: region.exit3:
+; CHECK-NEXT: [[TMP7:%.*]] = addrspacecast ptr [[SELECT]] to ptr addrspace(5)
; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1
; CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: __omp_outlined__.exit:
@@ -128,16 +130,16 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %
; CHECK-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM6_I]]
; CHECK-NEXT: br label [[REGION_CHECK_TID10:%.*]]
; CHECK: region.check.tid10:
-; CHECK-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
-; CHECK-NEXT: br i1 [[TMP7]], label [[REGION_GUARDED9:%.*]], label [[REGION_BARRIER7:%.*]]
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
+; CHECK-NEXT: br i1 [[TMP9]], label [[REGION_GUARDED9:%.*]], label [[REGION_BARRIER7:%.*]]
; CHECK: region.guarded9:
; CHECK-NEXT: store i32 [[CALL_I]], ptr [[ARRAYIDX7_I]], align 4, !noalias [[META7]]
; CHECK-NEXT: br label [[REGION_GUARDED_END6:%.*]]
; CHECK: region.guarded.end6:
; CHECK-NEXT: br label [[REGION_BARRIER7]]
; CHECK: region.barrier7:
-; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP6]])
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP8]])
; CHECK-NEXT: br label [[REGION_EXIT8:%.*]]
; CHECK: region.exit8:
; CHECK-NEXT: [[CALL8_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]]
@@ -145,16 +147,16 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %
; CHECK-NEXT: [[ARRAYIDX10_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM9_I]]
; CHECK-NEXT: br label [[REGION_CHECK_TID15:%.*]]
; CHECK: region.check.tid15:
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
-; CHECK-NEXT: br i1 [[TMP9]], label [[REGION_GUARDED14:%.*]], label [[REGION_BARRIER12:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0
+; CHECK-NEXT: br i1 [[TMP11]], label [[REGION_GUARDED14:%.*]], label [[REGION_BARRIER12:%.*]]
; CHECK: region.guarded14:
; CHECK-NEXT: store i32 [[CALL8_I]], ptr [[ARRAYIDX10_I]], align 4, !noalias [[META7]]
; CHECK-NEXT: br label [[REGION_GUARDED_END11:%.*]]
; CHECK: region.guarded.end11:
; CHECK-NEXT: br label [[REGION_BARRIER12]]
; CHECK: region.barrier12:
-; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP8]])
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP10]])
; CHECK-NEXT: br label [[REGION_EXIT13:%.*]]
; CHECK: region.exit13:
; CHECK-NEXT: [[CALL11_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]]
@@ -162,16 +164,16 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %
; CHECK-NEXT: [[ARRAYIDX13_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM12_I]]
; CHECK-NEXT: br label [[REGION_CHECK_TID20:%.*]]
; CHECK: region.check.tid20:
-; CHECK-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0
-; CHECK-NEXT: br i1 [[TMP11]], label [[REGION_GUARDED19:%.*]], label [[REGION_BARRIER17:%.*]]
+; CHECK-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 0
+; CHECK-NEXT: br i1 [[TMP13]], label [[REGION_GUARDED19:%.*]], label [[REGION_BARRIER17:%.*]]
; CHECK: region.guarded19:
; CHECK-NEXT: store i32 [[CALL11_I]], ptr [[ARRAYIDX13_I]], align 4, !noalias [[META7]]
; CHECK-NEXT: br label [[REGION_GUARDED_END16:%.*]]
; CHECK: region.guarded.end16:
; CHECK-NEXT: br label [[REGION_BARRIER17]]
; CHECK: region.barrier17:
-; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP10]])
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP12]])
; CHECK-NEXT: br label [[REGION_EXIT18:%.*]]
; CHECK: region.exit18:
; CHECK-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]]
@@ -228,8 +230,9 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %
; CHECK-DISABLED: user_code.entry:
; CHECK-DISABLED-NEXT: [[C:%.*]] = icmp eq i64 [[N]], 42
; CHECK-DISABLED-NEXT: [[SELECT:%.*]] = select i1 [[C]], ptr [[AL32]], ptr addrspacecast (ptr addrspace(5) @LocGlob to ptr)
-; CHECK-DISABLED-NEXT: store ptr [[SELECT]], ptr [[LOC]], align 8
-; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]]
+; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[LOC]] to ptr addrspace(5)
+; CHECK-DISABLED-NEXT: store ptr [[SELECT]], ptr addrspace(5) [[TMP1]], align 8
+; CHECK-DISABLED-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]]
; CHECK-DISABLED-NEXT: store i32 0, ptr [[X]], align 4, !noalias [[META7:![0-9]+]]
; CHECK-DISABLED-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 1
; CHECK-DISABLED-NEXT: store i32 1, ptr [[ARRAYIDX1_I]], align 4, !noalias [[META7]]
@@ -249,6 +252,7 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %
; CHECK-DISABLED-NEXT: [[IDXPROM4_I:%.*]] = zext i32 [[I_0_I]] to i64
; CHECK-DISABLED-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM4_I]]
; CHECK-DISABLED-NEXT: store i32 [[SUB3_I]], ptr [[ARRAYIDX5_I]], align 4, !noalias [[META7]]
+; CHECK-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[SELECT]] to ptr addrspace(5)
; CHECK-DISABLED-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1
; CHECK-DISABLED-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-DISABLED: __omp_outlined__.exit:
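Most of the churn in the spmdization_guarding.ll hunks above is mechanical: each new addrspacecast consumes one unnamed SSA value, so every later [[TMPn]] FileCheck binding shifts by one; only the addrspacecast/store pairs are semantically new. A schematic, self-contained sketch of that semantic change follows; @spill_demo and the %loc/%select names are illustrative, not the test's FileCheck bindings, and the addrspace(5) assumption again reflects an AMDGPU-style target:

define internal void @spill_demo(i1 %c, ptr %al32, ptr %glob) {
entry:
  %loc = alloca ptr, align 8
  %select = select i1 %c, ptr %al32, ptr %glob
  ; previously: store ptr %select, ptr %loc, align 8
  ; now the alloca-backed %loc is accessed through its addrspace(5) form:
  %loc.ascast = addrspacecast ptr %loc to ptr addrspace(5)
  store ptr %select, ptr addrspace(5) %loc.ascast, align 8
  ret void
}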
diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
index a644fe1b2f821..3a3ecafc32b86 100644
--- a/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
@@ -195,6 +195,8 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %.
; CHECK-NEXT: entry:
; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; CHECK-NEXT: call void @unknown() #[[ATTR8:[0-9]+]]
; CHECK-NEXT: ret void
;
@@ -203,6 +205,8 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %.
; CHECK-DISABLE-SPMDIZATION-NEXT: entry:
; CHECK-DISABLE-SPMDIZATION-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; CHECK-DISABLE-SPMDIZATION-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; CHECK-DISABLE-SPMDIZATION-NEXT: call void @unknown() #[[ATTR8:[0-9]+]]
; CHECK-DISABLE-SPMDIZATION-NEXT: ret void
;
@@ -224,6 +228,9 @@ define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) #2 {
; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
+; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
; CHECK-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; CHECK-NEXT: call void @__omp_outlined__(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; CHECK-NEXT: ret void
@@ -235,6 +242,9 @@ define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) #2 {
; CHECK-DISABLE-SPMDIZATION-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; CHECK-DISABLE-SPMDIZATION-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; CHECK-DISABLE-SPMDIZATION-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
+; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
; CHECK-DISABLE-SPMDIZATION-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; CHECK-DISABLE-SPMDIZATION-NEXT: call void @__omp_outlined__(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; CHECK-DISABLE-SPMDIZATION-NEXT: ret void
diff --git a/llvm/test/Transforms/OpenMP/spmdization_indirect.ll b/llvm/test/Transforms/OpenMP/spmdization_indirect.ll
index ef8caf48e57b7..d61600fefbdd2 100644
--- a/llvm/test/Transforms/OpenMP/spmdization_indirect.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_indirect.ll
@@ -56,22 +56,24 @@ define internal void @spmd_callees__debug(i1 %c) {
; AMDGPU-NEXT: ret void
; AMDGPU: user_code.entry:
; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10:[0-9]+]]
-; AMDGPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
-; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]]
+; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU-NEXT: store i32 0, ptr addrspace(5) [[TMP2]], align 4
+; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
+; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12:![0-9]+]]
; AMDGPU-NEXT: [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable1, ptr @__omp_outlined_spmd_amenable2
-; AMDGPU-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable2
-; AMDGPU-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
-; AMDGPU: 3:
-; AMDGPU-NEXT: call void @__omp_outlined_spmd_amenable2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
-; AMDGPU-NEXT: br label [[TMP7:%.*]]
-; AMDGPU: 4:
-; AMDGPU-NEXT: br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]]
+; AMDGPU-NEXT: [[TMP4:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable2
+; AMDGPU-NEXT: br i1 [[TMP4]], label [[TMP5:%.*]], label [[TMP6:%.*]]
; AMDGPU: 5:
-; AMDGPU-NEXT: call void @__omp_outlined_spmd_amenable1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
-; AMDGPU-NEXT: br label [[TMP7]]
+; AMDGPU-NEXT: call void @__omp_outlined_spmd_amenable2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
+; AMDGPU-NEXT: br label [[TMP9:%.*]]
; AMDGPU: 6:
-; AMDGPU-NEXT: unreachable
+; AMDGPU-NEXT: br i1 true, label [[TMP7:%.*]], label [[TMP8:%.*]]
; AMDGPU: 7:
+; AMDGPU-NEXT: call void @__omp_outlined_spmd_amenable1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
+; AMDGPU-NEXT: br label [[TMP9]]
+; AMDGPU: 8:
+; AMDGPU-NEXT: unreachable
+; AMDGPU: 9:
; AMDGPU-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-NEXT: br label [[COMMON_RET]]
;
@@ -87,22 +89,24 @@ define internal void @spmd_callees__debug(i1 %c) {
; NVPTX-NEXT: ret void
; NVPTX: user_code.entry:
; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10:[0-9]+]]
-; NVPTX-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
-; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]]
+; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-NEXT: store i32 0, ptr addrspace(5) [[TMP2]], align 4
+; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
+; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12:![0-9]+]]
; NVPTX-NEXT: [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable1, ptr @__omp_outlined_spmd_amenable2
-; NVPTX-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable2
-; NVPTX-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
-; NVPTX: 3:
-; NVPTX-NEXT: call void @__omp_outlined_spmd_amenable2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
-; NVPTX-NEXT: br label [[TMP7:%.*]]
-; NVPTX: 4:
-; NVPTX-NEXT: br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]]
+; NVPTX-NEXT: [[TMP4:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable2
+; NVPTX-NEXT: br i1 [[TMP4]], label [[TMP5:%.*]], label [[TMP6:%.*]]
; NVPTX: 5:
-; NVPTX-NEXT: call void @__omp_outlined_spmd_amenable1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
-; NVPTX-NEXT: br label [[TMP7]]
+; NVPTX-NEXT: call void @__omp_outlined_spmd_amenable2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
+; NVPTX-NEXT: br label [[TMP9:%.*]]
; NVPTX: 6:
-; NVPTX-NEXT: unreachable
+; NVPTX-NEXT: br i1 true, label [[TMP7:%.*]], label [[TMP8:%.*]]
; NVPTX: 7:
+; NVPTX-NEXT: call void @__omp_outlined_spmd_amenable1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
+; NVPTX-NEXT: br label [[TMP9]]
+; NVPTX: 8:
+; NVPTX-NEXT: unreachable
+; NVPTX: 9:
; NVPTX-NEXT: call void @__kmpc_target_deinit()
; NVPTX-NEXT: br label [[COMMON_RET]]
;
@@ -217,6 +221,8 @@ define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #3 {
; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
+; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; AMDGPU-NEXT: ret void
@@ -227,6 +233,8 @@ define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #3 {
; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
+; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; NVPTX-NEXT: ret void
@@ -340,6 +348,8 @@ define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #3 {
; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
+; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; AMDGPU-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; AMDGPU-NEXT: ret void
@@ -350,6 +360,8 @@ define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #3 {
; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
+; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; NVPTX-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
; NVPTX-NEXT: ret void
@@ -412,22 +424,24 @@ define weak ptx_kernel void @spmd_and_non_spmd_callee(i1 %c) #0 {
; AMDGPU-NEXT: ret void
; AMDGPU: user_code.entry:
; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]]
-; AMDGPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
-; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]]
+; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU-NEXT: store i32 0, ptr addrspace(5) [[TMP2]], align 4
+; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
+; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]]
; AMDGPU-NEXT: [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable3, ptr @__omp_outlined_not_spmd_amenable
-; AMDGPU-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_not_spmd_amenable
-; AMDGPU-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
-; AMDGPU: 3:
-; AMDGPU-NEXT: call void @__omp_outlined_not_spmd_amenable(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
-; AMDGPU-NEXT: br label [[TMP7:%.*]]
-; AMDGPU: 4:
-; AMDGPU-NEXT: br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]]
+; AMDGPU-NEXT: [[TMP4:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_not_spmd_amenable
+; AMDGPU-NEXT: br i1 [[TMP4]], label [[TMP5:%.*]], label [[TMP6:%.*]]
; AMDGPU: 5:
-; AMDGPU-NEXT: call void @__omp_outlined_spmd_amenable3(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
-; AMDGPU-NEXT: br label [[TMP7]]
+; AMDGPU-NEXT: call void @__omp_outlined_not_spmd_amenable(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
+; AMDGPU-NEXT: br label [[TMP9:%.*]]
; AMDGPU: 6:
-; AMDGPU-NEXT: unreachable
+; AMDGPU-NEXT: br i1 true, label [[TMP7:%.*]], label [[TMP8:%.*]]
; AMDGPU: 7:
+; AMDGPU-NEXT: call void @__omp_outlined_spmd_amenable3(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
+; AMDGPU-NEXT: br label [[TMP9]]
+; AMDGPU: 8:
+; AMDGPU-NEXT: unreachable
+; AMDGPU: 9:
; AMDGPU-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-NEXT: br label [[COMMON_RET]]
;
@@ -472,22 +486,24 @@ define weak ptx_kernel void @spmd_and_non_spmd_callee(i1 %c) #0 {
; NVPTX-NEXT: ret void
; NVPTX: user_code.entry:
; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]]
-; NVPTX-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
-; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]]
+; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-NEXT: store i32 0, ptr addrspace(5) [[TMP2]], align 4
+; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
+; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]]
; NVPTX-NEXT: [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable3, ptr @__omp_outlined_not_spmd_amenable
-; NVPTX-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_not_spmd_amenable
-; NVPTX-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
-; NVPTX: 3:
-; NVPTX-NEXT: call void @__omp_outlined_not_spmd_amenable(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
-; NVPTX-NEXT: br label [[TMP7:%.*]]
-; NVPTX: 4:
-; NVPTX-NEXT: br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]]
+; NVPTX-NEXT: [[TMP4:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_not_spmd_amenable
+; NVPTX-NEXT: br i1 [[TMP4]], label [[TMP5:%.*]], label [[TMP6:%.*]]
; NVPTX: 5:
-; NVPTX-NEXT: call void @__omp_outlined_spmd_amenable3(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
-; NVPTX-NEXT: br label [[TMP7]]
+; NVPTX-NEXT: call void @__omp_outlined_not_spmd_amenable(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
+; NVPTX-NEXT: br label [[TMP9:%.*]]
; NVPTX: 6:
-; NVPTX-NEXT: unreachable
+; NVPTX-NEXT: br i1 true, label [[TMP7:%.*]], label [[TMP8:%.*]]
; NVPTX: 7:
+; NVPTX-NEXT: call void @__omp_outlined_spmd_amenable3(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR10]]
+; NVPTX-NEXT: br label [[TMP9]]
+; NVPTX: 8:
+; NVPTX-NEXT: unreachable
+; NVPTX: 9:
; NVPTX-NEXT: call void @__kmpc_target_deinit()
; NVPTX-NEXT: br label [[COMMON_RET]]
;
@@ -530,9 +546,10 @@ define internal void @__omp_outlined_spmd_amenable3(ptr noalias %.global_tid., p
; AMDGPU-NEXT: call void @__kmpc_free_shared(ptr [[X]], i64 4) #[[ATTR10]]
; AMDGPU-NEXT: ret void
; AMDGPU: for.body:
-; AMDGPU-NEXT: store ptr [[X]], ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]]
-; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]]
-; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
+; AMDGPU-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS]] to ptr addrspace(5)
+; AMDGPU-NEXT: store ptr [[X]], ptr addrspace(5) [[TMP0]], align 8, !tbaa [[TBAA20:![0-9]+]]
+; AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]]
+; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
;
@@ -551,9 +568,10 @@ define internal void @__omp_outlined_spmd_amenable3(ptr noalias %.global_tid., p
; NVPTX-NEXT: call void @__kmpc_free_shared(ptr [[X]], i64 4) #[[ATTR10]]
; NVPTX-NEXT: ret void
; NVPTX: for.body:
-; NVPTX-NEXT: store ptr [[X]], ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]]
-; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]]
-; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
+; NVPTX-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS]] to ptr addrspace(5)
+; NVPTX-NEXT: store ptr [[X]], ptr addrspace(5) [[TMP0]], align 8, !tbaa [[TBAA20:![0-9]+]]
+; NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]]
+; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
;
@@ -620,10 +638,13 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 {
; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
+; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
-; AMDGPU-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
-; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]]
-; AMDGPU-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR10]]
+; AMDGPU-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[GLOBAL_ARGS]] to ptr addrspace(5)
+; AMDGPU-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[TMP4]], align 8
+; AMDGPU-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA20]]
+; AMDGPU-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP6]]) #[[ATTR10]]
; AMDGPU-NEXT: ret void
;
; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper
@@ -632,10 +653,13 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 {
; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
+; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
-; NVPTX-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
-; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]]
-; NVPTX-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR10]]
+; NVPTX-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[GLOBAL_ARGS]] to ptr addrspace(5)
+; NVPTX-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[TMP4]], align 8
+; NVPTX-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA20]]
+; NVPTX-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP6]]) #[[ATTR10]]
; NVPTX-NEXT: ret void
;
entry:
@@ -667,8 +691,10 @@ define weak ptx_kernel void @spmd_callees_metadata(ptr %fp) #0 {
; AMDGPU-NEXT: ret void
; AMDGPU: user_code.entry:
; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]]
-; AMDGPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
-; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]]
+; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU-NEXT: store i32 0, ptr addrspace(5) [[TMP2]], align 4
+; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
+; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]]
; AMDGPU-NEXT: call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
; AMDGPU-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-NEXT: br label [[COMMON_RET]]
@@ -685,8 +711,10 @@ define weak ptx_kernel void @spmd_callees_metadata(ptr %fp) #0 {
; NVPTX-NEXT: ret void
; NVPTX: user_code.entry:
; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]]
-; NVPTX-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
-; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]]
+; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-NEXT: store i32 0, ptr addrspace(5) [[TMP2]], align 4
+; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
+; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]]
; NVPTX-NEXT: call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
; NVPTX-NEXT: call void @__kmpc_target_deinit()
; NVPTX-NEXT: br label [[COMMON_RET]]
@@ -756,21 +784,23 @@ define weak ptx_kernel void @spmd_and_non_spmd_callees_metadata(ptr %fp) #0 {
; AMDGPU-NEXT: ret void
; AMDGPU: user_code.entry:
; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]]
-; AMDGPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
-; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]]
-; AMDGPU-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable_external
-; AMDGPU-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
-; AMDGPU: 3:
-; AMDGPU-NEXT: call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
-; AMDGPU-NEXT: br label [[TMP7:%.*]]
-; AMDGPU: 4:
-; AMDGPU-NEXT: br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]]
+; AMDGPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; AMDGPU-NEXT: store i32 0, ptr addrspace(5) [[TMP2]], align 4
+; AMDGPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
+; AMDGPU-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]]
+; AMDGPU-NEXT: [[TMP4:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable_external
+; AMDGPU-NEXT: br i1 [[TMP4]], label [[TMP5:%.*]], label [[TMP6:%.*]]
; AMDGPU: 5:
-; AMDGPU-NEXT: call void @__omp_outlined_not_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
-; AMDGPU-NEXT: br label [[TMP7]]
+; AMDGPU-NEXT: call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
+; AMDGPU-NEXT: br label [[TMP9:%.*]]
; AMDGPU: 6:
-; AMDGPU-NEXT: unreachable
+; AMDGPU-NEXT: br i1 true, label [[TMP7:%.*]], label [[TMP8:%.*]]
; AMDGPU: 7:
+; AMDGPU-NEXT: call void @__omp_outlined_not_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
+; AMDGPU-NEXT: br label [[TMP9]]
+; AMDGPU: 8:
+; AMDGPU-NEXT: unreachable
+; AMDGPU: 9:
; AMDGPU-NEXT: call void @__kmpc_target_deinit()
; AMDGPU-NEXT: br label [[COMMON_RET]]
;
@@ -815,21 +845,23 @@ define weak ptx_kernel void @spmd_and_non_spmd_callees_metadata(ptr %fp) #0 {
; NVPTX-NEXT: ret void
; NVPTX: user_code.entry:
; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]]
-; NVPTX-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
-; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]]
-; NVPTX-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable_external
-; NVPTX-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
-; NVPTX: 3:
-; NVPTX-NEXT: call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
-; NVPTX-NEXT: br label [[TMP7:%.*]]
-; NVPTX: 4:
-; NVPTX-NEXT: br i1 true, label [[TMP5:%.*]], label [[TMP6:%.*]]
+; NVPTX-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
+; NVPTX-NEXT: store i32 0, ptr addrspace(5) [[TMP2]], align 4
+; NVPTX-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTTHREADID_TEMP_]] to ptr addrspace(5)
+; NVPTX-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[TMP3]], align 4, !tbaa [[TBAA12]]
+; NVPTX-NEXT: [[TMP4:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable_external
+; NVPTX-NEXT: br i1 [[TMP4]], label [[TMP5:%.*]], label [[TMP6:%.*]]
; NVPTX: 5:
-; NVPTX-NEXT: call void @__omp_outlined_not_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
-; NVPTX-NEXT: br label [[TMP7]]
+; NVPTX-NEXT: call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
+; NVPTX-NEXT: br label [[TMP9:%.*]]
; NVPTX: 6:
-; NVPTX-NEXT: unreachable
+; NVPTX-NEXT: br i1 true, label [[TMP7:%.*]], label [[TMP8:%.*]]
; NVPTX: 7:
+; NVPTX-NEXT: call void @__omp_outlined_not_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]])
+; NVPTX-NEXT: br label [[TMP9]]
+; NVPTX: 8:
+; NVPTX-NEXT: unreachable
+; NVPTX: 9:
; NVPTX-NEXT: call void @__kmpc_target_deinit()
; NVPTX-NEXT: br label [[COMMON_RET]]
;
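The large-looking rewrites of spmd_callees__debug, spmd_and_non_spmd_callee, and spmd_and_non_spmd_callees_metadata above mostly track IR numbering rather than control flow: unnamed instruction results and unnamed basic blocks share one counter, so the two new addrspacecasts in user_code.entry push every implicit block label up by two (the old 3:, 5:, and 7: blocks reappear as 5:, 7:, and 9:) while the branch structure itself is unchanged. A small hypothetical function showing the effect with a single inserted cast; @renumber_demo is not part of the test suite:

define void @renumber_demo(i1 %c) {
entry:
  %x = alloca i32, align 4
  %0 = addrspacecast ptr %x to ptr addrspace(5)   ; new unnamed value claims %0
  store i32 0, ptr addrspace(5) %0, align 4
  br i1 %c, label %1, label %2                    ; blocks formerly %0 and %1

1:
  ret void

2:
  ret void
}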
diff --git a/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll b/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll
index 1cfce147ac81e..5b1470f985411 100644
--- a/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll
@@ -281,6 +281,8 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %.
; CHECK-NEXT: entry:
; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; CHECK-NEXT: call void @leaf() #[[ATTR8]]
; CHECK-NEXT: ret void
;
@@ -289,6 +291,8 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %.
; CHECK-DISABLE-SPMDIZATION-NEXT: entry:
; CHECK-DISABLE-SPMDIZATION-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; CHECK-DISABLE-SPMDIZATION-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; CHECK-DISABLE-SPMDIZATION-NEXT: call void @leaf() #[[ATTR8]]
; CHECK-DISABLE-SPMDIZATION-NEXT: ret void
;
@@ -310,6 +314,9 @@ define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) #2 {
; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
+; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
; CHECK-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; CHECK-NEXT: call void @__omp_outlined__(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR8]]
; CHECK-NEXT: ret void
@@ -321,6 +328,9 @@ define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) #2 {
; CHECK-DISABLE-SPMDIZATION-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; CHECK-DISABLE-SPMDIZATION-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; CHECK-DISABLE-SPMDIZATION-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
+; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
; CHECK-DISABLE-SPMDIZATION-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; CHECK-DISABLE-SPMDIZATION-NEXT: call void @__omp_outlined__(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR8]]
; CHECK-DISABLE-SPMDIZATION-NEXT: ret void