diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 21abc346cf17a..81cf2ad9498a7 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -260,6 +260,7 @@ LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have mo LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.") LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region") LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.") +LANGOPT(OpenMPForceUSM , 1, 0, "Enable OpenMP unified shared memory mode via compiler.") LANGOPT(NoGPULib , 1, 0, "Indicate a build without the standard GPU libraries.") LANGOPT(RenderScript , 1, 0, "RenderScript") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 2b93ddf033499..28290da438c62 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3451,6 +3451,10 @@ def fopenmp_offload_mandatory : Flag<["-"], "fopenmp-offload-mandatory">, Group< Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CC1Option]>, HelpText<"Do not create a host fallback if offloading to the device fails.">, MarshallingInfoFlag>; +def fopenmp_force_usm : Flag<["-"], "fopenmp-force-usm">, Group, + Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CC1Option]>, + HelpText<"Force behvaior as if the user specified pragma omp requires unified_shared_memory.">, + MarshallingInfoFlag>; def fopenmp_target_jit : Flag<["-"], "fopenmp-target-jit">, Group, Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CLOption]>, HelpText<"Emit code that can be JIT compiled for OpenMP offloading. Implies -foffload-lto=full">; diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index ea6645a39e832..4855e7410a015 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -1044,6 +1044,13 @@ CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM) ? CGM.getLangOpts().OMPHostIRFile : StringRef{}); OMPBuilder.setConfig(Config); + + // The user forces the compiler to behave as if omp requires + // unified_shared_memory was given. + if (CGM.getLangOpts().OpenMPForceUSM) { + HasRequiresUnifiedSharedMemory = true; + OMPBuilder.Config.setHasRequiresUnifiedSharedMemory(true); + } } void CGOpenMPRuntime::clear() { diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index acfa119805068..ffc24201ab2e0 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6382,6 +6382,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-fopenmp-assume-no-nested-parallelism"); if (Args.hasArg(options::OPT_fopenmp_offload_mandatory)) CmdArgs.push_back("-fopenmp-offload-mandatory"); + if (Args.hasArg(options::OPT_fopenmp_force_usm)) + CmdArgs.push_back("-fopenmp-force-usm"); break; default: // By default, if Clang doesn't know how to generate useful OpenMP code diff --git a/clang/test/OpenMP/force-usm.c b/clang/test/OpenMP/force-usm.c new file mode 100644 index 0000000000000..5c63a9a5e7004 --- /dev/null +++ b/clang/test/OpenMP/force-usm.c @@ -0,0 +1,79 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 3 +// REQUIRES: amdgpu-registered-target + +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -fopenmp-force-usm -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-force-usm -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix=CHECK-USM %s + +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix=CHECK-DEFAULT %s +// expected-no-diagnostics + +extern "C" void *malloc(unsigned int b); + +int GI; +#pragma omp declare target +int *pGI; +#pragma omp end declare target + +int main(void) { + + GI = 0; + + pGI = (int *) malloc(sizeof(int)); + *pGI = 42; + +#pragma omp target map(pGI[:1], GI) + { + GI = 1; + *pGI = 2; + } + + return 0; +} + +// CHECK-USM-LABEL: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25( +// CHECK-USM-SAME: ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GI:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-USM-NEXT: entry: +// CHECK-USM-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-USM-NEXT: [[GI_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-USM-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-USM-NEXT: [[GI_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GI_ADDR]] to ptr +// CHECK-USM-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-USM-NEXT: store ptr [[GI]], ptr [[GI_ADDR_ASCAST]], align 8 +// CHECK-USM-NEXT: [[TMP0:%.*]] = load ptr, ptr [[GI_ADDR_ASCAST]], align 8 +// CHECK-USM-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_kernel_environment to ptr), ptr [[DYN_PTR]]) +// CHECK-USM-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 +// CHECK-USM-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// CHECK-USM: user_code.entry: +// CHECK-USM-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-USM-NEXT: [[TMP2:%.*]] = load ptr, ptr @pGI_decl_tgt_ref_ptr, align 8 +// CHECK-USM-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 +// CHECK-USM-NEXT: store i32 2, ptr [[TMP3]], align 4 +// CHECK-USM-NEXT: call void @__kmpc_target_deinit() +// CHECK-USM-NEXT: ret void +// CHECK-USM: worker.exit: +// CHECK-USM-NEXT: ret void +// +// +// CHECK-DEFAULT-LABEL: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25( +// CHECK-DEFAULT-SAME: ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GI:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-DEFAULT-NEXT: entry: +// CHECK-DEFAULT-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-DEFAULT-NEXT: [[GI_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-DEFAULT-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-DEFAULT-NEXT: [[GI_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GI_ADDR]] to ptr +// CHECK-DEFAULT-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-DEFAULT-NEXT: store ptr [[GI]], ptr [[GI_ADDR_ASCAST]], align 8 +// CHECK-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[GI_ADDR_ASCAST]], align 8 +// CHECK-DEFAULT-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_kernel_environment to ptr), ptr [[DYN_PTR]]) +// CHECK-DEFAULT-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 +// CHECK-DEFAULT-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// CHECK-DEFAULT: user_code.entry: +// CHECK-DEFAULT-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-DEFAULT-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspacecast (ptr addrspace(1) @pGI to ptr), align 8 +// CHECK-DEFAULT-NEXT: store i32 2, ptr [[TMP2]], align 4 +// CHECK-DEFAULT-NEXT: call void @__kmpc_target_deinit() +// CHECK-DEFAULT-NEXT: ret void +// CHECK-DEFAULT: worker.exit: +// CHECK-DEFAULT-NEXT: ret void +// diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg index 19c5e5c457222..3b9b9da4649db 100644 --- a/openmp/libomptarget/test/lit.cfg +++ b/openmp/libomptarget/test/lit.cfg @@ -185,6 +185,8 @@ for libomptarget_target in config.libomptarget_all_targets: "%libomptarget-compile-and-run-" + libomptarget_target)) config.substitutions.append(("%libomptarget-compilexx-generic", "%libomptarget-compilexx-" + libomptarget_target)) + config.substitutions.append(("%libomptarget-compilexxx-generic-force-usm", + "%libomptarget-compilexxx-force-usm-" + libomptarget_target)) config.substitutions.append(("%libomptarget-compile-generic", "%libomptarget-compile-" + libomptarget_target)) config.substitutions.append(("%libomptarget-compile-fortran-generic", @@ -242,6 +244,9 @@ for libomptarget_target in config.libomptarget_all_targets: config.substitutions.append(("%libomptarget-compilexx-" + \ libomptarget_target, \ "%clangxx-" + libomptarget_target + add_libraries(" %s -o %t"))) + config.substitutions.append(("%libomptarget-compilexxx-force-usm-" + + libomptarget_target, "%clangxxx-force-usm-" + libomptarget_target + \ + add_libraries(" %s -o %t"))) config.substitutions.append(("%libomptarget-compile-" + \ libomptarget_target, \ "%clang-" + libomptarget_target + add_libraries(" %s -o %t"))) @@ -279,6 +284,9 @@ for libomptarget_target in config.libomptarget_all_targets: config.substitutions.append(("%clangxx-" + libomptarget_target, \ "%clangxx %openmp_flags %cuda_flags %flags %flags_clang -fopenmp-targets=" +\ remove_suffix_if_present(libomptarget_target))) + config.substitutions.append(("%clangxxx-force-usm-" + libomptarget_target, \ + "%clangxx %openmp_flags -fopenmp-force-usm %cuda_flags %flags %flags_clang -fopenmp-targets=" +\ + remove_suffix_if_present(libomptarget_target))) config.substitutions.append(("%clang-" + libomptarget_target, \ "%clang %openmp_flags %cuda_flags %flags %flags_clang -fopenmp-targets=" +\ remove_suffix_if_present(libomptarget_target))) diff --git a/openmp/libomptarget/test/offloading/force-usm.cpp b/openmp/libomptarget/test/offloading/force-usm.cpp new file mode 100644 index 0000000000000..5bddecd5b4675 --- /dev/null +++ b/openmp/libomptarget/test/offloading/force-usm.cpp @@ -0,0 +1,59 @@ +// clang-format off +// RUN: %libomptarget-compilexx-generic +// RUN: env LIBOMPTARGET_INFO=32 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=NO-USM +// +// RUN: %libomptarget-compilexxx-generic-force-usm +// RUN: env HSA_XNACK=1 LIBOMPTARGET_INFO=32 \ +// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=FORCE-USM +// +// UNSUPPORTED: nvptx64-nvidia-cuda +// UNSUPPORTED: nvptx64-nvidia-cuda-LTO +// clang-format on + +#include +#include +#include + +int GI; +#pragma omp declare target +int *pGI; +#pragma omp end declare target + +int main(void) { + + GI = 0; + // Implicit mappings + int alpha = 1; + int beta[3] = {2, 5, 8}; + + // Require map clauses for non-USM execution + pGI = (int *)malloc(sizeof(int)); + *pGI = 42; + +#pragma omp target map(pGI[ : 1], GI) + { + GI = 1 * alpha; + *pGI = 2 * beta[1]; + } + + assert(GI == 1); + assert(*pGI == 10); + + printf("SUCCESS\n"); + + return 0; +} + +// clang-format off +// NO-USM: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=4 +// NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=12 +// NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=4 +// NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=8, Name=pGI +// NO-USM-NEXT: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=4 +// NO-USM-NEXT: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=12 +// NO-USM-NEXT: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=4 +// NO-USM-NEXT: SUCCESS + +// FORCE-USM: SUCCESS +// +// clang-format on