diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst index 4c848ca76fb8d..62ed75797955e 100644 --- a/openmp/docs/design/Runtimes.rst +++ b/openmp/docs/design/Runtimes.rst @@ -1465,3 +1465,4 @@ debugging features are supported. * Enable debugging assertions in the device. ``0x01`` * Enable diagnosing common problems during offloading . ``0x4`` + * Enable device malloc statistics (amdgpu only). ``0x8`` diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt index fee2414b456a1..f71bdeae3d7f0 100644 --- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt +++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt @@ -83,6 +83,7 @@ endif() list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES) set(include_files + ${include_directory}/Allocator.h ${include_directory}/Configuration.h ${include_directory}/Debug.h ${include_directory}/Interface.h @@ -95,6 +96,7 @@ set(include_files ) set(src_files + ${source_directory}/Allocator.cpp ${source_directory}/Configuration.cpp ${source_directory}/Debug.cpp ${source_directory}/Kernel.cpp diff --git a/openmp/libomptarget/DeviceRTL/include/Allocator.h b/openmp/libomptarget/DeviceRTL/include/Allocator.h new file mode 100644 index 0000000000000..a28eb0fb2977e --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Allocator.h @@ -0,0 +1,44 @@ +//===-------- Allocator.h - OpenMP memory allocator interface ---- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_ALLOCATOR_H +#define OMPTARGET_ALLOCATOR_H + +#include "Types.h" + +// Forward declaration. +struct KernelEnvironmentTy; + +#pragma omp begin declare target device_type(nohost) + +namespace ompx { + +namespace allocator { + +static uint64_t constexpr ALIGNMENT = 16; + +/// Initialize the allocator according to \p KernelEnvironment +void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment); + +/// Allocate \p Size bytes. +[[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void * +alloc(uint64_t Size); + +/// Free the allocation pointed to by \p Ptr. +void free(void *Ptr); + +} // namespace allocator + +} // namespace ompx + +#pragma omp end declare target + +#endif diff --git a/openmp/libomptarget/DeviceRTL/src/Allocator.cpp b/openmp/libomptarget/DeviceRTL/src/Allocator.cpp new file mode 100644 index 0000000000000..7a4cbfe60dd02 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Allocator.cpp @@ -0,0 +1,80 @@ +//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "Allocator.h" +#include "Configuration.h" +#include "Environment.h" +#include "Mapping.h" +#include "Synchronization.h" +#include "Types.h" +#include "Utils.h" + +using namespace ompx; + +#pragma omp begin declare target device_type(nohost) + +[[gnu::used, gnu::retain, gnu::weak, + gnu::visibility( + "protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool; +[[gnu::used, gnu::retain, gnu::weak, + gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy + __omp_rtl_device_memory_pool_tracker; + +/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool +/// directly. +struct BumpAllocatorTy final { + + void *alloc(uint64_t Size) { + Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT)); + + if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) { + atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1, + atomic::seq_cst); + atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size, + atomic::seq_cst); + atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size, + atomic::seq_cst); + atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size, + atomic::seq_cst); + } + + uint64_t *Data = + reinterpret_cast(&__omp_rtl_device_memory_pool.Ptr); + uint64_t End = + reinterpret_cast(Data) + __omp_rtl_device_memory_pool.Size; + + uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst); + if (OldData + Size > End) + __builtin_trap(); + + return reinterpret_cast(OldData); + } + + void free(void *) {} +}; + +BumpAllocatorTy BumpAllocator; + +/// allocator namespace implementation +/// +///{ + +void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) { + // TODO: Check KernelEnvironment for an allocator choice as soon as we have + // more than one. +} + +void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); } + +void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); } + +///} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp index 222983577164c..91e8a00bdef9d 100644 --- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "Allocator.h" #include "Debug.h" #include "Environment.h" #include "Interface.h" @@ -30,6 +31,7 @@ static void inititializeRuntime(bool IsSPMD, synchronize::init(IsSPMD); mapping::init(IsSPMD); state::init(IsSPMD, KernelEnvironment); + allocator::init(IsSPMD, KernelEnvironment); } /// Simple generic state machine for worker threads. diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp index efa0502b82a5c..70f30e9ba120c 100644 --- a/openmp/libomptarget/DeviceRTL/src/State.cpp +++ b/openmp/libomptarget/DeviceRTL/src/State.cpp @@ -9,6 +9,8 @@ //===----------------------------------------------------------------------===// #include "State.h" +#include "Allocator.h" +#include "Configuration.h" #include "Debug.h" #include "Environment.h" #include "Interface.h" @@ -26,18 +28,16 @@ using namespace ompx; /// ///{ -/// Add worst-case padding so that future allocations are properly aligned. -/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to be -/// passed in as an argument and the stack rewritten to support it. -constexpr const uint32_t Alignment = 16; - /// External symbol to access dynamic shared memory. -[[gnu::aligned(Alignment)]] extern unsigned char DynamicSharedBuffer[]; +[[gnu::aligned( + allocator::ALIGNMENT)]] extern unsigned char DynamicSharedBuffer[]; #pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc) /// The kernel environment passed to the init method by the compiler. static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr); +///} + namespace { /// Fallback implementations are missing to trigger a link time error. @@ -45,29 +45,19 @@ namespace { /// dedicated begin/end declare variant. /// ///{ - extern "C" { -[[gnu::weak, gnu::leaf]] void *malloc(uint64_t Size); -[[gnu::weak, gnu::leaf]] void free(void *Ptr); -} +#ifdef __AMDGPU__ -///} +[[gnu::weak]] void *malloc(uint64_t Size) { return allocator::alloc(Size); } +[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); } -/// AMDGCN implementations of the shuffle sync idiom. -/// -///{ -#pragma omp begin declare variant match(device = {arch(amdgcn)}) +#else -extern "C" { -void *malloc(uint64_t Size) { - // TODO: Use some preallocated space for dynamic malloc. - return nullptr; -} +[[gnu::weak, gnu::leaf]] void *malloc(uint64_t Size); +[[gnu::weak, gnu::leaf]] void free(void *Ptr); -void free(void *Ptr) {} +#endif } - -#pragma omp end declare variant ///} /// A "smart" stack in shared memory. @@ -96,7 +86,7 @@ struct SharedMemorySmartStackTy { uint32_t computeThreadStorageTotal() { uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock(); return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock), - Alignment); + allocator::ALIGNMENT); } /// Return the top address of the warp data stack, that is the first address @@ -106,8 +96,10 @@ struct SharedMemorySmartStackTy { } /// The actual storage, shared among all warps. - [[gnu::aligned(Alignment)]] unsigned char Data[state::SharedScratchpadSize]; - [[gnu::aligned(Alignment)]] unsigned char Usage[mapping::MaxThreadsPerTeam]; + [[gnu::aligned( + allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize]; + [[gnu::aligned( + allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam]; }; static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256, @@ -122,7 +114,9 @@ void SharedMemorySmartStackTy::init(bool IsSPMD) { void *SharedMemorySmartStackTy::push(uint64_t Bytes) { // First align the number of requested bytes. - uint64_t AlignedBytes = utils::align_up(Bytes, Alignment); + /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to + /// be passed in as an argument and the stack rewritten to support it. + uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT); uint32_t StorageTotal = computeThreadStorageTotal(); @@ -150,7 +144,7 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) { } void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) { - uint64_t AlignedBytes = utils::align_up(Bytes, Alignment); + uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT); if (utils::isSharedMemPtr(Ptr)) { int TId = mapping::getThreadIdInBlock(); Usage[TId] -= AlignedBytes; diff --git a/openmp/libomptarget/include/Environment.h b/openmp/libomptarget/include/Environment.h index 48a0fa933bdd8..8194736ae4e0a 100644 --- a/openmp/libomptarget/include/Environment.h +++ b/openmp/libomptarget/include/Environment.h @@ -43,6 +43,27 @@ struct DeviceEnvironmentTy { uint64_t HardwareParallelism; }; +struct DeviceMemoryPoolTy { + void *Ptr; + uint64_t Size; +}; + +struct DeviceMemoryPoolTrackingTy { + uint64_t NumAllocations; + uint64_t AllocationTotal; + uint64_t AllocationMin; + uint64_t AllocationMax; + + void combine(DeviceMemoryPoolTrackingTy &Other) { + NumAllocations += Other.NumAllocations; + AllocationTotal += Other.AllocationTotal; + AllocationMin = AllocationMin > Other.AllocationMin ? Other.AllocationMin + : AllocationMin; + AllocationMax = AllocationMax < Other.AllocationMax ? Other.AllocationMax + : AllocationMax; + } +}; + // NOTE: Please don't change the order of those members as their indices are // used in the middle end. Always add the new data member at the end. // Different from KernelEnvironmentTy below, this structure contains members diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp index 148ca09654092..ab24856f9bc78 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2529,10 +2529,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Plugin::success(); } Error getDeviceHeapSize(uint64_t &Value) override { - Value = 0; + Value = DeviceMemoryPoolSize; + return Plugin::success(); + } + Error setDeviceHeapSize(uint64_t Value) override { + for (DeviceImageTy *Image : LoadedImages) + if (auto Err = setupDeviceMemoryPool(Plugin::get(), *Image, Value)) + return Err; + DeviceMemoryPoolSize = Value; return Plugin::success(); } - Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); } /// AMDGPU-specific function to get device attributes. template Error getDeviceAttr(uint32_t Kind, Ty &Value) { @@ -2625,6 +2631,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// Reference to the host device. AMDHostDeviceTy &HostDevice; + + /// The current size of the global device memory pool (managed by us). + uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */; }; Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) { diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp index c8fb8d552f429..0243f0205dbf0 100644 --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -590,6 +590,35 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) { } Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { + + if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) { + GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler(); + for (auto *Image : LoadedImages) { + DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0}; + GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker", + sizeof(DeviceMemoryPoolTrackingTy), + &ImageDeviceMemoryPoolTracking); + if (auto Err = + GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) + return Err; + DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking); + } + + // TODO: Write this by default into a file. + printf("\n\n|-----------------------\n" + "| Device memory tracker:\n" + "|-----------------------\n" + "| #Allocations: %lu\n" + "| Byes allocated: %lu\n" + "| Minimal allocation: %lu\n" + "| Maximal allocation: %lu\n" + "|-----------------------\n\n\n", + DeviceMemoryPoolTracking.NumAllocations, + DeviceMemoryPoolTracking.AllocationTotal, + DeviceMemoryPoolTracking.AllocationMin, + DeviceMemoryPoolTracking.AllocationMax); + } + // Delete the memory manager before deinitializing the device. Otherwise, // we may delete device allocations after the device is deinitialized. if (MemoryManager) @@ -648,6 +677,17 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, if (auto Err = setupDeviceEnvironment(Plugin, *Image)) return std::move(Err); + // Setup the global device memory pool if needed. + if (shouldSetupDeviceMemoryPool()) { + uint64_t HeapSize; + auto SizeOrErr = getDeviceHeapSize(HeapSize); + if (SizeOrErr) { + REPORT("No global device memory pool due to error: %s\n", + toString(std::move(SizeOrErr)).data()); + } else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize)) + return std::move(Err); + } + // Register all offload entries of the image. if (auto Err = registerOffloadEntries(*Image)) return std::move(Err); @@ -713,6 +753,45 @@ Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin, return Plugin::success(); } +Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin, + DeviceImageTy &Image, + uint64_t PoolSize) { + // Free the old pool, if any. + if (DeviceMemoryPool.Ptr) { + if (auto Err = dataDelete(DeviceMemoryPool.Ptr, + TargetAllocTy::TARGET_ALLOC_DEVICE)) + return Err; + } + + DeviceMemoryPool.Size = PoolSize; + auto AllocOrErr = dataAlloc(PoolSize, /*HostPtr=*/nullptr, + TargetAllocTy::TARGET_ALLOC_DEVICE); + if (AllocOrErr) { + DeviceMemoryPool.Ptr = *AllocOrErr; + } else { + auto Err = AllocOrErr.takeError(); + REPORT("Failure to allocate device memory for global memory pool: %s\n", + toString(std::move(Err)).data()); + DeviceMemoryPool.Ptr = nullptr; + DeviceMemoryPool.Size = 0; + } + + // Create the metainfo of the device environment global. + GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker", + sizeof(DeviceMemoryPoolTrackingTy), + &DeviceMemoryPoolTracking); + GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler(); + if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal)) + return Err; + + // Create the metainfo of the device environment global. + GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool", + sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool); + + // Write device environment values to the device. + return GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal); +} + Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image) { // The plugin either does not need an RPC server or it is unavailible. @@ -1327,10 +1406,6 @@ Error GenericPluginTy::init() { } Error GenericPluginTy::deinit() { - // There is no global handler if no device is available. - if (GlobalHandler) - delete GlobalHandler; - // Deinitialize all active devices. for (int32_t DeviceId = 0; DeviceId < NumDevices; ++DeviceId) { if (Devices[DeviceId]) { @@ -1340,6 +1415,10 @@ Error GenericPluginTy::deinit() { assert(!Devices[DeviceId] && "Device was not deinitialized"); } + // There is no global handler if no device is available. + if (GlobalHandler) + delete GlobalHandler; + if (RPCServer) delete RPCServer; diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h index 4a9c223eefbcf..ddcf3b3cc9b95 100644 --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -625,6 +625,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// this behavior by overriding the shouldSetupDeviceEnvironment function. Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image); + /// Setup the global device memory pool, if the plugin requires one. + Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image, + uint64_t PoolSize); + // Setup the RPC server for this device if needed. This may not run on some // plugins like the CPU targets. By default, it will not be executed so it is // up to the target to override this using the shouldSetupRPCServer function. @@ -831,6 +835,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// setupDeviceEnvironment() function. virtual bool shouldSetupDeviceEnvironment() const { return true; } + /// Indicate whether the device should setup the global device memory pool. If + /// false is return the value on the device will be uninitialized. + virtual bool shouldSetupDeviceMemoryPool() const { return true; } + /// Indicate whether or not the device should setup the RPC server. This is /// only necessary for unhosted targets like the GPU. virtual bool shouldSetupRPCServer() const { return false; } @@ -911,6 +919,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// Return the kernel environment object for kernel \p Name. Expected getKernelEnvironmentForKernel(StringRef Name, DeviceImageTy &Image); + + DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0}; + DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0}; }; /// Class implementing common functionalities of offload plugins. Each plugin diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp index 1bb8dc1064d86..431e34ca75cd6 100644 --- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -843,6 +843,11 @@ struct CUDADeviceTy : public GenericDeviceTy { return Plugin::success(); } + virtual bool shouldSetupDeviceMemoryPool() const override { + /// We use the CUDA malloc for now. + return false; + } + /// Getters and setters for stack and heap sizes. Error getDeviceStackSize(uint64_t &Value) override { return getCtxLimit(CU_LIMIT_STACK_SIZE, Value); diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp index 8f2022bac9d8a..619f4dfed9b4e 100644 --- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp @@ -307,8 +307,9 @@ struct GenELF64DeviceTy : public GenericDeviceTy { return Plugin::success(); } - /// This plugin should not setup the device environment. + /// This plugin should not setup the device environment or memory pool. virtual bool shouldSetupDeviceEnvironment() const override { return false; }; + virtual bool shouldSetupDeviceMemoryPool() const override { return false; }; /// Getters and setters for stack size and heap size not relevant. Error getDeviceStackSize(uint64_t &Value) override { diff --git a/openmp/libomptarget/test/mapping/lambda_mapping.cpp b/openmp/libomptarget/test/mapping/lambda_mapping.cpp index acec6c82288a8..63b1719fbbc36 100644 --- a/openmp/libomptarget/test/mapping/lambda_mapping.cpp +++ b/openmp/libomptarget/test/mapping/lambda_mapping.cpp @@ -1,7 +1,7 @@ -// On AMDGPU we don't have malloc support yet. We need optimizations -// to avoid a thread state which requires malloc. -// -// XUN: %libomptarget-compilexx-run-and-check-generic +// Unonptimized, we need 24000000 bytes heap +// RUN: %libomptarget-compilexx-generic +// RUN: env LIBOMPTARGET_HEAP_SIZE=24000000 \ +// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic // RUN: %libomptarget-compileoptxx-run-and-check-generic #include diff --git a/openmp/libomptarget/test/offloading/malloc.c b/openmp/libomptarget/test/offloading/malloc.c new file mode 100644 index 0000000000000..ad49ace200625 --- /dev/null +++ b/openmp/libomptarget/test/offloading/malloc.c @@ -0,0 +1,37 @@ +// RUN: %libomptarget-compile-generic && %libomptarget-run-generic +// RUN: %libomptarget-compileopt-generic && %libomptarget-run-generic + +#include +#include + +int main() { + long unsigned *DP = 0; + int N = 128; + int Threads = 128; + int Teams = 440; + + // Allocate ~55MB on the device. +#pragma omp target map(from : DP) + DP = (long unsigned *)malloc(sizeof(long unsigned) * N * Threads * Teams); + +#pragma omp target teams distribute parallel for num_teams(Teams) \ + thread_limit(Threads) is_device_ptr(DP) + for (int i = 0; i < Threads * Teams; ++i) { + for (int j = 0; j < N; ++j) { + DP[i * N + j] = i + j; + } + } + + long unsigned s = 0; +#pragma omp target teams distribute parallel for num_teams(Teams) \ + thread_limit(Threads) reduction(+ : s) + for (int i = 0; i < Threads * Teams; ++i) { + for (int j = 0; j < N; ++j) { + s += DP[i * N + j]; + } + } + + // CHECK: Sum: 203458478080 + printf("Sum: %li\n", s); + return 0; +} diff --git a/openmp/libomptarget/test/offloading/malloc_parallel.c b/openmp/libomptarget/test/offloading/malloc_parallel.c new file mode 100644 index 0000000000000..b8e975ca55a8f --- /dev/null +++ b/openmp/libomptarget/test/offloading/malloc_parallel.c @@ -0,0 +1,42 @@ +// RUN: %libomptarget-compile-generic && %libomptarget-run-generic +// RUN: %libomptarget-compileopt-generic && %libomptarget-run-generic + +#include +#include +#include + +int main() { + long unsigned **DP = 0; + int N = 128; + int Threads = 128; + int Teams = 440; + +#pragma omp target map(from : DP) + DP = (long unsigned **)malloc(sizeof(long unsigned *) * Threads * Teams); + +#pragma omp target teams distribute parallel for num_teams(Teams) \ + thread_limit(Threads) + for (int i = 0; i < Threads * Teams; ++i) + DP[i] = (long unsigned *)malloc(sizeof(long unsigned) * N); + +#pragma omp target teams distribute parallel for num_teams(Teams) \ + thread_limit(Threads) + for (int i = 0; i < Threads * Teams; ++i) { + for (int j = 0; j < N; ++j) { + DP[i][j] = i + j; + } + } + + long unsigned s = 0; +#pragma omp target teams distribute parallel for num_teams(Teams) \ + thread_limit(Threads) reduction(+ : s) + for (int i = 0; i < Threads * Teams; ++i) { + for (int j = 0; j < N; ++j) { + s += DP[i][j]; + } + } + + // CHECK: Sum: 203458478080 + printf("Sum: %li\n", s); + return 0; +}