173 changes: 173 additions & 0 deletions openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
//===- RPC.cpp - Interface for remote procedure calls from the GPU --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "RPC.h"
#include "Debug.h"
#include "PluginInterface.h"

// This header file may be present in-tree or from an LLVM installation. The
// installed version lives alongside the GPU headers so we do not want to
// include it directly.
#if __has_include(<gpu-none-llvm/rpc_server.h>)
#include <gpu-none-llvm/rpc_server.h>
#elif defined(LIBOMPTARGET_RPC_SUPPORT)
#include <rpc_server.h>
#endif

using namespace llvm;
using namespace omp;
using namespace target;

RPCServerTy::RPCServerTy(uint32_t NumDevices) {
#ifdef LIBOMPTARGET_RPC_SUPPORT
  // Initialize the global RPC state for every device up front. A failure
  // here means nothing else can work, so abort the process immediately.
  rpc_status_t Status = rpc_init(NumDevices);
  if (Status)
    FATAL_MESSAGE(1, "Error initializing the RPC server: %d\n", Status);
  Handles.resize(NumDevices);
#endif
}

llvm::Expected<bool>
RPCServerTy::isDeviceUsingRPC(plugin::GenericDeviceTy &Device,
                              plugin::GenericGlobalHandlerTy &Handler,
                              plugin::DeviceImageTy &Image) {
#ifdef LIBOMPTARGET_RPC_SUPPORT
  // An image participates in RPC exactly when it contains the externally
  // visible client symbol. Probe for it; a read failure simply means the
  // symbol is absent, not that anything went wrong.
  void *Placeholder;
  plugin::GlobalTy ClientGlobal(rpc_client_symbol_name, sizeof(void *),
                                &Placeholder);
  llvm::Error Err = Handler.readGlobalFromImage(Device, Image, ClientGlobal);
  bool UsesRPC = !Err;
  if (Err)
    llvm::consumeError(std::move(Err));
  return UsesRPC;
#else
  return false;
#endif
}

/// Set up the RPC server instance for a single device: create the server's
/// buffers via the plugin's host allocator, register the plugin-specific
/// malloc/free opcode handlers, and publish the client buffer to the symbol
/// in the device image. The image must already be loaded on the device.
Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
                              plugin::GenericGlobalHandlerTy &Handler,
                              plugin::DeviceImageTy &Image) {
#ifdef LIBOMPTARGET_RPC_SUPPORT
  uint32_t DeviceId = Device.getDeviceId();
  // Allocation callback handed to the RPC library so the shared buffers are
  // created through the plugin as host-accessible memory.
  auto Alloc = [](uint64_t Size, void *Data) {
    plugin::GenericDeviceTy &Device =
        *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
    return Device.allocate(Size, nullptr, TARGET_ALLOC_HOST);
  };
  // TODO: Allow the device to declare its requested port count.
  if (rpc_status_t Err = rpc_server_init(DeviceId, RPC_MAXIMUM_PORT_COUNT,
                                         Device.getWarpSize(), Alloc, &Device))
    return plugin::Plugin::error(
        "Failed to initialize RPC server for device %d: %d", DeviceId, Err);

  // Register a custom opcode handler to perform plugin specific allocation.
  // The device sends a size in data[0] and receives the device pointer back
  // in the same slot.
  // FIXME: We need to make sure this uses asynchronous allocations on CUDA.
  auto MallocHandler = [](rpc_port_t Port, void *Data) {
    rpc_recv_and_send(
        Port,
        [](rpc_buffer_t *Buffer, void *Data) {
          plugin::GenericDeviceTy &Device =
              *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
          Buffer->data[0] = reinterpret_cast<uintptr_t>(
              Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE));
        },
        Data);
  };
  if (rpc_status_t Err =
          rpc_register_callback(DeviceId, RPC_MALLOC, MallocHandler, &Device))
    return plugin::Plugin::error(
        "Failed to register RPC malloc handler for device %d: %d\n", DeviceId,
        Err);

  // Register a custom opcode handler to perform plugin specific deallocation.
  // The device sends the pointer to free in data[0]; nothing is sent back.
  auto FreeHandler = [](rpc_port_t Port, void *Data) {
    rpc_recv(
        Port,
        [](rpc_buffer_t *Buffer, void *Data) {
          plugin::GenericDeviceTy &Device =
              *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
          Device.free(reinterpret_cast<void *>(Buffer->data[0]),
                      TARGET_ALLOC_DEVICE);
        },
        Data);
  };
  if (rpc_status_t Err =
          rpc_register_callback(DeviceId, RPC_FREE, FreeHandler, &Device))
    return plugin::Plugin::error(
        "Failed to register RPC free handler for device %d: %d\n", DeviceId,
        Err);

  // Get the address of the RPC client from the device.
  void *ClientPtr;
  plugin::GlobalTy ClientGlobal(rpc_client_symbol_name, sizeof(void *));
  if (auto Err =
          Handler.getGlobalMetadataFromDevice(Device, Image, ClientGlobal))
    return Err;

  if (auto Err = Device.dataRetrieve(&ClientPtr, ClientGlobal.getPtr(),
                                     sizeof(void *), nullptr))
    return Err;

  // Copy the server's client state into the device-side client object so
  // both sides agree on the shared buffers.
  const void *ClientBuffer = rpc_get_client_buffer(DeviceId);
  if (auto Err = Device.dataSubmit(ClientPtr, ClientBuffer,
                                   rpc_get_client_size(), nullptr))
    return Err;

  // Record the per-device handle so getDevice() can hand it out later.
  Handles[DeviceId] = std::make_unique<RPCHandleTy>(*this, Device);
#endif
  return Error::success();
}

llvm::Expected<RPCHandleTy *>
RPCServerTy::getDevice(plugin::GenericDeviceTy &Device) {
#ifdef LIBOMPTARGET_RPC_SUPPORT
  // Only hand out a handle once initDevice() has fully set this device up:
  // the handle exists and both server and client buffers are live.
  uint32_t DeviceId = Device.getDeviceId();
  bool Initialized = Handles[DeviceId] && rpc_get_buffer(DeviceId) &&
                     rpc_get_client_buffer(DeviceId);
  if (!Initialized)
    return plugin::Plugin::error(
        "Attempt to get an RPC device while not initialized");
  return Handles[DeviceId].get();
#else
  return plugin::Plugin::error(
      "Attempt to get an RPC device while not available");
#endif
}

Error RPCServerTy::runServer(plugin::GenericDeviceTy &Device) {
#ifdef LIBOMPTARGET_RPC_SUPPORT
  // Service whatever work the device has queued on its RPC ports.
  uint32_t DeviceId = Device.getDeviceId();
  rpc_status_t Status = rpc_handle_server(DeviceId);
  if (Status)
    return plugin::Plugin::error(
        "Error while running RPC server on device %d: %d", DeviceId, Status);
#endif
  return Error::success();
}

Error RPCServerTy::deinitDevice(plugin::GenericDeviceTy &Device) {
#ifdef LIBOMPTARGET_RPC_SUPPORT
  uint32_t DeviceId = Device.getDeviceId();
  // The server's buffers were created through the plugin's host allocator,
  // so release them through the same device with a matching callback.
  auto FreeHost = [](void *Ptr, void *Data) {
    auto &OwningDevice = *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
    OwningDevice.free(Ptr, TARGET_ALLOC_HOST);
  };
  rpc_status_t Status = rpc_server_shutdown(DeviceId, FreeHost, &Device);
  if (Status)
    return plugin::Plugin::error(
        "Failed to shut down RPC server for device %d: %d", DeviceId, Status);
#endif
  return Error::success();
}

/// Release the global RPC state created by rpc_init() in the constructor.
RPCServerTy::~RPCServerTy() {
#ifdef LIBOMPTARGET_RPC_SUPPORT
  rpc_shutdown();
#endif
}
87 changes: 87 additions & 0 deletions openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
//===- RPC.h - Interface for remote procedure calls from the GPU ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides the interface to support remote procedure calls (RPC) from
// the GPU. This is required to implement host services like printf or malloc.
// The interface to the RPC server is provided by the 'libc' project in LLVM.
// For more information visit https://libc.llvm.org/gpu/.
//
//===----------------------------------------------------------------------===//

#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RPC_H
#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RPC_H

#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Error.h"

#include <memory>
#include <stdint.h>

namespace llvm::omp::target {
namespace plugin {
struct GenericDeviceTy;
struct GenericGlobalHandlerTy;
class DeviceImageTy;
} // namespace plugin

/// A generic class implementing the interface between the RPC server provided
/// by the 'libc' project and 'libomptarget'. If the RPC server is not available
/// these routines will perform no action.
struct RPCServerTy {
public:
  /// A wrapper around a single instance of the RPC server for a given device.
  /// This is provided to simplify ownership of the underlying device.
  struct RPCHandleTy {
    RPCHandleTy(RPCServerTy &Server, plugin::GenericDeviceTy &Device)
        : Server(Server), Device(Device) {}

    /// Run the server until the device's pending RPC work is cleared.
    llvm::Error runServer() { return Server.runServer(Device); }

    /// Tear down the server state associated with this device.
    llvm::Error deinitDevice() { return Server.deinitDevice(Device); }

  private:
    RPCServerTy &Server;
    plugin::GenericDeviceTy &Device;
  };

  /// Initialize the global RPC state for \p NumDevices devices. Aborts the
  /// process if the underlying RPC library cannot be initialized.
  RPCServerTy(uint32_t NumDevices);

  /// Check if this device image is using an RPC server. This checks for the
  /// presence of an externally visible symbol in the device image that will
  /// be present whenever RPC code is called.
  llvm::Expected<bool> isDeviceUsingRPC(plugin::GenericDeviceTy &Device,
                                        plugin::GenericGlobalHandlerTy &Handler,
                                        plugin::DeviceImageTy &Image);

  /// Initialize the RPC server for the given device. This will allocate host
  /// memory for the internal server and copy the data to the client on the
  /// device. The device must be loaded before this is valid.
  llvm::Error initDevice(plugin::GenericDeviceTy &Device,
                         plugin::GenericGlobalHandlerTy &Handler,
                         plugin::DeviceImageTy &Image);

  /// Gets a reference to this server for a specific device. Fails if
  /// initDevice() has not completed for that device.
  llvm::Expected<RPCHandleTy *> getDevice(plugin::GenericDeviceTy &Device);

  /// Runs the RPC server associated with the \p Device until the pending work
  /// is cleared.
  llvm::Error runServer(plugin::GenericDeviceTy &Device);

  /// Deinitialize the RPC server for the given device. This will free the
  /// memory associated with the RPC server for that device.
  llvm::Error deinitDevice(plugin::GenericDeviceTy &Device);

  /// Shuts down the global RPC state.
  ~RPCServerTy();

private:
  /// One handle per device, populated lazily by initDevice().
  llvm::SmallVector<std::unique_ptr<RPCHandleTy>> Handles;
};

using RPCHandleTy = RPCServerTy::RPCHandleTy;

} // namespace llvm::omp::target

#endif
19 changes: 18 additions & 1 deletion openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,12 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::check(Res, "Error in cuCtxSetCurrent: %s");
}

/// We want to set up the RPC server for host services to the GPU if it is
/// available.
bool shouldSetupRPCServer() const override {
  // Defer to the generic query for whether RPC support was built in.
  return libomptargetSupportsRPC();
}

/// Get the stream of the asynchronous info sructure or get a new one.
CUstream getStream(AsyncInfoWrapperTy &AsyncInfoWrapper) {
CUstream &Stream = AsyncInfoWrapper.getQueueAs<CUstream>();
Expand Down Expand Up @@ -464,7 +470,18 @@ struct CUDADeviceTy : public GenericDeviceTy {
/// Synchronize current thread with the pending operations on the async info.
Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
CUresult Res = cuStreamSynchronize(Stream);
CUresult Res;
// If we have an RPC server running on this device we will continuously
// query it for work rather than blocking.
if (!getRPCHandle()) {
Res = cuStreamSynchronize(Stream);
} else {
do {
Res = cuStreamQuery(Stream);
if (auto Err = getRPCHandle()->runServer())
return Err;
} while (Res == CUDA_ERROR_NOT_READY);
}

// Once the stream is synchronized, return it to stream pool and reset
// AsyncInfo. This is to make sure the synchronization only works for its
Expand Down
1 change: 1 addition & 0 deletions openmp/libomptarget/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ string(REPLACE " " ";" LIBOMPTARGET_LIT_ARG_LIST "${LIBOMPTARGET_LIT_ARGS}")
string(REGEX MATCHALL "([^\ ]+\ |[^\ ]+$)" SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}")
foreach(CURRENT_TARGET IN LISTS SYSTEM_TARGETS)
string(STRIP "${CURRENT_TARGET}" CURRENT_TARGET)

add_openmp_testsuite(check-libomptarget-${CURRENT_TARGET}
"Running libomptarget tests"
${CMAKE_CURRENT_BINARY_DIR}/${CURRENT_TARGET}
Expand Down
33 changes: 33 additions & 0 deletions openmp/libomptarget/test/libc/malloc.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// RUN: %libomptarget-compile-run-and-check-generic

// REQUIRES: libc

// TODO: This requires async malloc on CUDA which is an 11.2 feature.
// UNSUPPORTED: nvptx64-nvidia-cuda
// UNSUPPORTED: nvptx64-nvidia-cuda-LTO

#include <stdio.h>
#include <stdlib.h>

#pragma omp declare target to(malloc)
#pragma omp declare target to(free)

int main() {
  unsigned h_x;
  unsigned *d_x;
  // Allocate and initialize a word on the device via the RPC-backed malloc;
  // the device pointer is mapped back to the host.
#pragma omp target map(from : d_x)
  {
    d_x = malloc(sizeof(unsigned));
    *d_x = 1;
  }

  // Read the value back through the device pointer into host memory.
#pragma omp target is_device_ptr(d_x) map(from : h_x)
  { h_x = *d_x; }

  // Release the device allocation through the RPC-backed free.
#pragma omp target is_device_ptr(d_x)
  { free(d_x); }

  // CHECK: PASS
  if (h_x == 1)
    fputs("PASS\n", stdout);
}
35 changes: 35 additions & 0 deletions openmp/libomptarget/test/libc/puts.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// RUN: %libomptarget-compile-run-and-check-generic

// REQUIRES: libc

#include <stdio.h>

#pragma omp declare target to(stdout)

int main() {
  // Synchronous target region: one line of output.
  // CHECK: PASS
#pragma omp target
  { fputs("PASS\n", stdout); }

  // Two nowait regions exercise output while the host thread is not blocked
  // in the target region.
  // CHECK: PASS
#pragma omp target nowait
  { fputs("PASS\n", stdout); }

  // CHECK: PASS
#pragma omp target nowait
  { fputs("PASS\n", stdout); }

#pragma omp taskwait

  // 4 teams x 2 threads each print once: eight lines total.
  // CHECK: PASS
  // CHECK: PASS
  // CHECK: PASS
  // CHECK: PASS
  // CHECK: PASS
  // CHECK: PASS
  // CHECK: PASS
  // CHECK: PASS
#pragma omp target teams num_teams(4)
#pragma omp parallel num_threads(2)
  { fputs("PASS\n", stdout); }
}
15 changes: 11 additions & 4 deletions openmp/libomptarget/test/lit.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ if 'flang' in config.llvm_enabled_projects:
config.available_features.add('flang')
tools.append(ToolSubst('%flang', command=FindTool('flang-new'), unresolved='fatal'))

# Gate tests marked 'REQUIRES: libc' on GPU libc support.
# NOTE(review): libomptarget_has_libc comes from a CMake @...@ substitution as
# a string; any non-empty expansion (including "FALSE") is truthy here --
# confirm the site config normalizes it to a real boolean.
if config.libomptarget_has_libc:
    config.available_features.add('libc')

# Determine whether the test system supports unified memory.
# For CUDA, this is the case with compute capability 70 (Volta) or higher.
# For all other targets, we currently assume it is.
Expand Down Expand Up @@ -213,10 +216,12 @@ for libomptarget_target in config.libomptarget_all_targets:
"%libomptarget-run-" + libomptarget_target))
# Link the GPU C library when available. The conditional must be
# parenthesized: without the parentheses the 'if/else' applies to the whole
# concatenation, so the 'else' branch would replace the entire compile
# command with an empty string.
config.substitutions.append(("%libomptarget-compilexx-" + \
    libomptarget_target, \
    "%clangxx-" + libomptarget_target + " %s -o %t" + \
    (" -lcgpu" if config.libomptarget_has_libc else "")))
# Link the GPU C library when available. The conditional must be
# parenthesized so that the 'else' branch only drops the '-lcgpu' flag
# rather than the whole compile command.
config.substitutions.append(("%libomptarget-compile-" + \
    libomptarget_target, \
    "%clang-" + libomptarget_target + " %s -o %t" + \
    (" -lcgpu" if config.libomptarget_has_libc else "")))
config.substitutions.append(("%libomptarget-compileoptxx-run-and-check-" + \
libomptarget_target, \
"%libomptarget-compileoptxx-and-run-" + libomptarget_target + \
Expand All @@ -235,10 +240,12 @@ for libomptarget_target in config.libomptarget_all_targets:
"%libomptarget-run-" + libomptarget_target))
# Link the GPU C library when available. The conditional must be
# parenthesized so that the 'else' branch only drops the '-lcgpu' flag
# rather than the whole compile command.
config.substitutions.append(("%libomptarget-compileoptxx-" + \
    libomptarget_target, \
    "%clangxx-" + libomptarget_target + " -O3 %s -o %t" + \
    (" -lcgpu" if config.libomptarget_has_libc else "")))
# Link the GPU C library when available. The conditional must be
# parenthesized so that the 'else' branch only drops the '-lcgpu' flag
# rather than the whole compile command.
config.substitutions.append(("%libomptarget-compileopt-" + \
    libomptarget_target, \
    "%clang-" + libomptarget_target + " -O3 %s -o %t" + \
    (" -lcgpu" if config.libomptarget_has_libc else "")))
config.substitutions.append(("%libomptarget-run-" + \
libomptarget_target, \
"%t"))
Expand Down
1 change: 1 addition & 0 deletions openmp/libomptarget/test/lit.site.cfg.in
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ config.libomptarget_not = "@OPENMP_NOT_EXECUTABLE@"
config.libomptarget_debug = @LIBOMPTARGET_DEBUG@
config.has_libomptarget_ompt = @LIBOMPTARGET_OMPT_SUPPORT@
config.llvm_enabled_projects = "@LLVM_ENABLE_PROJECTS@".split(";")
# NOTE(review): the @...@ CMake substitution yields a string, and any
# non-empty string (including "FALSE" or "0") is truthy in Python. Confirm
# downstream 'if config.libomptarget_has_libc' checks behave as intended,
# or convert this to a real boolean.
config.libomptarget_has_libc = "@LIBOMPTARGET_GPU_LIBC_SUPPORT@"

import lit.llvm
lit.llvm.initialize(lit_config, config)
Expand Down