Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ option(UR_BUILD_ADAPTER_HIP "Build the HIP adapter" OFF)
option(UR_BUILD_ADAPTER_NATIVE_CPU "Build the Native-CPU adapter" OFF)
option(UR_BUILD_ADAPTER_ALL "Build all currently supported adapters" OFF)
option(UR_BUILD_EXAMPLE_CODEGEN "Build the codegen example." OFF)
option(UR_BUILD_BENCHMARKS "Build UR benchmarks." OFF)
option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace for linux" OFF)
option(UR_ENABLE_ASSERTIONS "Enable assertions for all build types" OFF)
option(UR_BUILD_XPTI_LIBS "Build the XPTI libraries when tracing is enabled" ON)
Expand Down
10 changes: 10 additions & 0 deletions source/adapters/level_zero/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,16 @@ add_ur_adapter(${TARGET_NAME}
${CMAKE_CURRENT_SOURCE_DIR}/queue_api.hpp
${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp
${CMAKE_CURRENT_SOURCE_DIR}/sampler.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/common.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_factory.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/context.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_cache.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/event_provider_normal.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool_cache.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/event.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/latency_tracker.hpp
${CMAKE_CURRENT_SOURCE_DIR}/ur_level_zero.cpp
${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
${CMAKE_CURRENT_SOURCE_DIR}/context.cpp
Expand All @@ -139,6 +145,10 @@ add_ur_adapter(${TARGET_NAME}
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/context.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_cache.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/event_provider_normal.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool_cache.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/event.cpp
)

if(NOT WIN32)
Expand Down
124 changes: 0 additions & 124 deletions source/adapters/level_zero/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,130 +78,6 @@ preferCopyEngineForFill(ur_exp_command_buffer_handle_t CommandBuffer,
return UR_RESULT_SUCCESS;
}

/**
 * Calculates the work group size and the number of work groups for a kernel
 * launch, based on the LocalWorkSize if provided, or otherwise on a size
 * derived from the GlobalWorkSize.
 *
 * When LocalWorkSize is not provided the size is obtained from
 * zeKernelSuggestGroupSize, unless some global dimension exceeds UINT32_MAX,
 * in which case the largest divisor of each global dimension that does not
 * exceed the device's maximum group size is used instead.
 *
 * @param[in][optional] Kernel The Kernel. Used when LocalWorkSize is not
 * provided.
 * @param[in][optional] Device The device associated with the kernel. Used when
 * LocalWorkSize is not provided and a global dimension exceeds UINT32_MAX.
 * @param[out] ZeThreadGroupDimensions Number of work groups in each dimension.
 * All three counts are written; dimensions at or above WorkDim are set to 1.
 * @param[out] WG The work group size for each dimension. Dimensions at or
 * above WorkDim are set to 1.
 * @param[in] WorkDim The number of dimensions in the kernel. Must be 1, 2
 * or 3.
 * @param[in] GlobalWorkSize The global work size (WorkDim elements).
 * @param[in][optional] LocalWorkSize The local work size (WorkDim elements).
 * @return UR_RESULT_SUCCESS on success,
 * UR_RESULT_ERROR_INVALID_VALUE for a missing required argument or an
 * unsupported WorkDim, UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE when no valid
 * uniform work group size exists, or the translated Level Zero error.
 */
ur_result_t calculateKernelWorkDimensions(
    ur_kernel_handle_t Kernel, ur_device_handle_t Device,
    ze_group_count_t &ZeThreadGroupDimensions, uint32_t (&WG)[3],
    uint32_t WorkDim, const size_t *GlobalWorkSize,
    const size_t *LocalWorkSize) {

  UR_ASSERT(GlobalWorkSize, UR_RESULT_ERROR_INVALID_VALUE);
  // If LocalWorkSize is not provided then Kernel must be provided to query
  // suggested group size.
  UR_ASSERT(LocalWorkSize || Kernel, UR_RESULT_ERROR_INVALID_VALUE);

  // WorkDim is used below as a copy length into a 3-element array, so it has
  // to be validated before anything else touches it.
  if (WorkDim < 1 || WorkDim > 3) {
    logger::error("calculateKernelWorkDimensions: unsupported work_dim");
    return UR_RESULT_ERROR_INVALID_VALUE;
  }

  // New variable needed because GlobalWorkSize parameter might not be of size
  // 3
  size_t GlobalWorkSize3D[3]{1, 1, 1};
  std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D);

  if (LocalWorkSize) {
    WG[0] = ur_cast<uint32_t>(LocalWorkSize[0]);
    WG[1] = WorkDim >= 2 ? ur_cast<uint32_t>(LocalWorkSize[1]) : 1;
    WG[2] = WorkDim == 3 ? ur_cast<uint32_t>(LocalWorkSize[2]) : 1;
  } else {
    // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize3D
    // values do not fit to 32-bit that the API only supports currently.
    bool SuggestGroupSize = true;
    for (int I : {0, 1, 2}) {
      if (GlobalWorkSize3D[I] > UINT32_MAX) {
        SuggestGroupSize = false;
      }
    }
    if (SuggestGroupSize) {
      ZE2UR_CALL(zeKernelSuggestGroupSize,
                 (Kernel->ZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1],
                  GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2]));
    } else {
      // The fallback needs the device limits, so Device is mandatory here.
      UR_ASSERT(Device, UR_RESULT_ERROR_INVALID_VALUE);

      const uint32_t MaxGroupSize[3] = {
          Device->ZeDeviceComputeProperties->maxGroupSizeX,
          Device->ZeDeviceComputeProperties->maxGroupSizeY,
          Device->ZeDeviceComputeProperties->maxGroupSizeZ};

      for (int I : {0, 1, 2}) {
        // Try to find a I-dimension WG size that the GlobalWorkSize3D[I] is
        // fully divisible with. Start with the max possible size in
        // each dimension.
        uint32_t GroupSize = static_cast<uint32_t>(
            (std::min)(size_t(MaxGroupSize[I]), GlobalWorkSize3D[I]));
        if (GroupSize == 0) {
          // A zero candidate would make the modulo below divide by zero.
          logger::error("calculateKernelWorkDimensions: can't compute a WG "
                        "size for a zero-sized dimension");
          return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
        }
        while (GlobalWorkSize3D[I] % GroupSize) {
          --GroupSize;
        }
        // Use the padded 3D copy: the caller's array only holds WorkDim
        // elements.
        if (GlobalWorkSize3D[I] / GroupSize > UINT32_MAX) {
          logger::debug("calculateKernelWorkDimensions: can't find a WG size "
                        "suitable for global work size > UINT32_MAX");
          return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
        }
        WG[I] = GroupSize;
      }
      logger::debug("calculateKernelWorkDimensions: using computed WG "
                    "size = {{{}, {}, {}}}",
                    WG[0], WG[1], WG[2]);
    }
  }

  // Dimensions the kernel does not use always have a work group size of 1.
  for (uint32_t I = WorkDim; I < 3; ++I) {
    WG[I] = 1;
  }

  // A zero work group size would divide by zero when computing group counts.
  for (uint32_t I = 0; I < 3; ++I) {
    if (WG[I] == 0) {
      logger::error("calculateKernelWorkDimensions: work group size must be "
                    "non-zero in every dimension");
      return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
    }
  }

  // TODO: assert if sizes do not fit into 32-bit?
  // All three counts are written so the result does not depend on how the
  // caller initialized ZeThreadGroupDimensions; unused dimensions yield 1.
  ZeThreadGroupDimensions.groupCountX =
      ur_cast<uint32_t>(GlobalWorkSize3D[0] / WG[0]);
  ZeThreadGroupDimensions.groupCountY =
      ur_cast<uint32_t>(GlobalWorkSize3D[1] / WG[1]);
  ZeThreadGroupDimensions.groupCountZ =
      ur_cast<uint32_t>(GlobalWorkSize3D[2] / WG[2]);

  // Error handling for non-uniform group size case
  if (GlobalWorkSize3D[0] !=
      size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) {
    logger::error("calculateKernelWorkDimensions: invalid work_dim. The range "
                  "is not a multiple of the group size in the 1st dimension");
    return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
  }
  if (GlobalWorkSize3D[1] !=
      size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) {
    logger::error("calculateKernelWorkDimensions: invalid work_dim. The range "
                  "is not a multiple of the group size in the 2nd dimension");
    return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
  }
  if (GlobalWorkSize3D[2] !=
      size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) {
    logger::error("calculateKernelWorkDimensions: invalid work_dim. The range "
                  "is not a multiple of the group size in the 3rd dimension");
    return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
  }

  return UR_RESULT_SUCCESS;
}

/**
* Helper function for finding the Level Zero events associated with the
* commands in a command-buffer, each event is pointed to by a sync-point in the
Expand Down
2 changes: 2 additions & 0 deletions source/adapters/level_zero/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@

#include <umf_pools/disjoint_pool_config_parser.hpp>

#include "logger/ur_logger.hpp"

struct _ur_platform_handle_t;

static auto getUrResultString = [](ur_result_t Result) {
Expand Down
2 changes: 2 additions & 0 deletions source/adapters/level_zero/context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ struct ur_context_handle_t_ : _ur_object {

ur_context_handle_t_(ze_context_handle_t ZeContext) : ZeContext{ZeContext} {}

virtual ~ur_context_handle_t_() {}

// A L0 context handle is primarily used during creation and management of
// resources that may be used by multiple devices.
// This field is only set at ur_context_handle_t creation time, and cannot
Expand Down
13 changes: 13 additions & 0 deletions source/adapters/level_zero/event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
#include "logger/ur_logger.hpp"
#include "ur_level_zero.hpp"

#include "v2/event.hpp"
#include "v2/queue_factory.hpp"

void printZeEventList(const _ur_ze_event_list_t &UrZeEventList) {
if (UrL0Debug & UR_L0_DEBUG_BASIC) {
std::stringstream ss;
Expand Down Expand Up @@ -862,6 +865,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventWait(
UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(
ur_event_handle_t Event ///< [in] handle of the event object
) {
// TODO: make proper abstraction
if (v2::shouldUseQueueV2(nullptr, 0)) {
return reinterpret_cast<v2::ur_event_handle_t>(Event)->retain();
}

Event->RefCountExternal++;
Event->RefCount.increment();

Expand All @@ -871,6 +879,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(
UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(
ur_event_handle_t Event ///< [in] handle of the event object
) {
// TODO: make proper abstraction
if (v2::shouldUseQueueV2(nullptr, 0)) {
return reinterpret_cast<v2::ur_event_handle_t>(Event)->release();
}

Event->RefCountExternal--;
UR_CALL(urEventReleaseInternal(Event));

Expand Down
137 changes: 130 additions & 7 deletions source/adapters/level_zero/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,132 @@
#include "ur_api.h"
#include "ur_level_zero.hpp"

/**
 * Calculates the work group size and the number of work groups for a kernel
 * launch, based on the LocalWorkSize if provided, or otherwise on a size
 * derived from the GlobalWorkSize.
 *
 * When LocalWorkSize is not provided the size is obtained from
 * zeKernelSuggestGroupSize, unless some global dimension exceeds UINT32_MAX,
 * in which case the largest divisor of each global dimension that does not
 * exceed the device's maximum group size is used instead.
 *
 * @param[in][optional] Kernel The Kernel. Used when LocalWorkSize is not
 * provided.
 * @param[in][optional] Device The device associated with the kernel. Used when
 * LocalWorkSize is not provided and a global dimension exceeds UINT32_MAX.
 * @param[out] ZeThreadGroupDimensions Number of work groups in each dimension.
 * All three counts are written; dimensions at or above WorkDim are set to 1.
 * @param[out] WG The work group size for each dimension. Dimensions at or
 * above WorkDim are set to 1.
 * @param[in] WorkDim The number of dimensions in the kernel. Must be 1, 2
 * or 3.
 * @param[in] GlobalWorkSize The global work size (WorkDim elements).
 * @param[in][optional] LocalWorkSize The local work size (WorkDim elements).
 * @return UR_RESULT_SUCCESS on success,
 * UR_RESULT_ERROR_INVALID_VALUE for a missing required argument or an
 * unsupported WorkDim, UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE when no valid
 * uniform work group size exists, or the translated Level Zero error.
 */
ur_result_t calculateKernelWorkDimensions(
    ur_kernel_handle_t Kernel, ur_device_handle_t Device,
    ze_group_count_t &ZeThreadGroupDimensions, uint32_t (&WG)[3],
    uint32_t WorkDim, const size_t *GlobalWorkSize,
    const size_t *LocalWorkSize) {

  UR_ASSERT(GlobalWorkSize, UR_RESULT_ERROR_INVALID_VALUE);
  // If LocalWorkSize is not provided then Kernel must be provided to query
  // suggested group size.
  UR_ASSERT(LocalWorkSize || Kernel, UR_RESULT_ERROR_INVALID_VALUE);

  // WorkDim is used below as a copy length into a 3-element array, so it has
  // to be validated before anything else touches it.
  if (WorkDim < 1 || WorkDim > 3) {
    logger::error("calculateKernelWorkDimensions: unsupported work_dim");
    return UR_RESULT_ERROR_INVALID_VALUE;
  }

  // New variable needed because GlobalWorkSize parameter might not be of size
  // 3
  size_t GlobalWorkSize3D[3]{1, 1, 1};
  std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D);

  if (LocalWorkSize) {
    WG[0] = ur_cast<uint32_t>(LocalWorkSize[0]);
    WG[1] = WorkDim >= 2 ? ur_cast<uint32_t>(LocalWorkSize[1]) : 1;
    WG[2] = WorkDim == 3 ? ur_cast<uint32_t>(LocalWorkSize[2]) : 1;
  } else {
    // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize3D
    // values do not fit to 32-bit that the API only supports currently.
    bool SuggestGroupSize = true;
    for (int I : {0, 1, 2}) {
      if (GlobalWorkSize3D[I] > UINT32_MAX) {
        SuggestGroupSize = false;
      }
    }
    if (SuggestGroupSize) {
      ZE2UR_CALL(zeKernelSuggestGroupSize,
                 (Kernel->ZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1],
                  GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2]));
    } else {
      // The fallback needs the device limits, so Device is mandatory here.
      UR_ASSERT(Device, UR_RESULT_ERROR_INVALID_VALUE);

      const uint32_t MaxGroupSize[3] = {
          Device->ZeDeviceComputeProperties->maxGroupSizeX,
          Device->ZeDeviceComputeProperties->maxGroupSizeY,
          Device->ZeDeviceComputeProperties->maxGroupSizeZ};

      for (int I : {0, 1, 2}) {
        // Try to find a I-dimension WG size that the GlobalWorkSize3D[I] is
        // fully divisible with. Start with the max possible size in
        // each dimension.
        uint32_t GroupSize = static_cast<uint32_t>(
            (std::min)(size_t(MaxGroupSize[I]), GlobalWorkSize3D[I]));
        if (GroupSize == 0) {
          // A zero candidate would make the modulo below divide by zero.
          logger::error("calculateKernelWorkDimensions: can't compute a WG "
                        "size for a zero-sized dimension");
          return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
        }
        while (GlobalWorkSize3D[I] % GroupSize) {
          --GroupSize;
        }
        // Use the padded 3D copy: the caller's array only holds WorkDim
        // elements.
        if (GlobalWorkSize3D[I] / GroupSize > UINT32_MAX) {
          logger::debug("calculateKernelWorkDimensions: can't find a WG size "
                        "suitable for global work size > UINT32_MAX");
          return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
        }
        WG[I] = GroupSize;
      }
      logger::debug("calculateKernelWorkDimensions: using computed WG "
                    "size = {{{}, {}, {}}}",
                    WG[0], WG[1], WG[2]);
    }
  }

  // Dimensions the kernel does not use always have a work group size of 1.
  for (uint32_t I = WorkDim; I < 3; ++I) {
    WG[I] = 1;
  }

  // A zero work group size would divide by zero when computing group counts.
  for (uint32_t I = 0; I < 3; ++I) {
    if (WG[I] == 0) {
      logger::error("calculateKernelWorkDimensions: work group size must be "
                    "non-zero in every dimension");
      return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
    }
  }

  // TODO: assert if sizes do not fit into 32-bit?
  // All three counts are written so the result does not depend on how the
  // caller initialized ZeThreadGroupDimensions; unused dimensions yield 1.
  ZeThreadGroupDimensions.groupCountX =
      ur_cast<uint32_t>(GlobalWorkSize3D[0] / WG[0]);
  ZeThreadGroupDimensions.groupCountY =
      ur_cast<uint32_t>(GlobalWorkSize3D[1] / WG[1]);
  ZeThreadGroupDimensions.groupCountZ =
      ur_cast<uint32_t>(GlobalWorkSize3D[2] / WG[2]);

  // Error handling for non-uniform group size case
  if (GlobalWorkSize3D[0] !=
      size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) {
    logger::error("calculateKernelWorkDimensions: invalid work_dim. The range "
                  "is not a multiple of the group size in the 1st dimension");
    return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
  }
  if (GlobalWorkSize3D[1] !=
      size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) {
    logger::error("calculateKernelWorkDimensions: invalid work_dim. The range "
                  "is not a multiple of the group size in the 2nd dimension");
    return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
  }
  if (GlobalWorkSize3D[2] !=
      size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) {
    logger::error("calculateKernelWorkDimensions: invalid work_dim. The range "
                  "is not a multiple of the group size in the 3rd dimension");
    return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
  }

  return UR_RESULT_SUCCESS;
}

/**
 * Applies a global work offset to a kernel through zeKernelSetGlobalOffsetExp.
 * @param[in] Context The context whose platform is checked for the global
 * offset driver extension.
 * @param[in] Kernel The kernel whose ZeKernel receives the offset.
 * @param[in] GlobalWorkOffset The offsets; elements 0, 1 and 2 are all read.
 * @return UR_RESULT_SUCCESS on success, UR_RESULT_ERROR_INVALID_VALUE when the
 * platform does not report the global offset extension, or the translated
 * Level Zero error.
 */
ur_result_t setKernelGlobalOffset(ur_context_handle_t Context,
                                  ur_kernel_handle_t Kernel,
                                  const size_t *GlobalWorkOffset) {
  const bool HasGlobalOffsetExt =
      Context->getPlatform()->ZeDriverGlobalOffsetExtensionFound;

  if (HasGlobalOffsetExt) {
    ZE2UR_CALL(zeKernelSetGlobalOffsetExp,
               (Kernel->ZeKernel, GlobalWorkOffset[0], GlobalWorkOffset[1],
                GlobalWorkOffset[2]));
    return UR_RESULT_SUCCESS;
  }

  // Without the extension there is no way to honour the requested offset.
  logger::debug("No global offset extension found on this driver");
  return UR_RESULT_ERROR_INVALID_VALUE;
}

UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim,
[[maybe_unused]] const size_t *pGlobalWorkOffset,
Expand All @@ -27,7 +153,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
std::copy(pGlobalWorkSize, pGlobalWorkSize + workDim, GlobalWorkSize3D);

ze_kernel_handle_t ZeKernel{};
UR_CALL(getZeKernel(Legacy(hQueue), hKernel, &ZeKernel));
UR_CALL(getZeKernel(Legacy(hQueue)->Device->ZeDevice, hKernel, &ZeKernel));

UR_CALL(getSuggestedLocalWorkSize(Legacy(hQueue), ZeKernel, GlobalWorkSize3D,
LocalWorkSize));
Expand All @@ -36,15 +162,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
return UR_RESULT_SUCCESS;
}

ur_result_t getZeKernel(ur_queue_handle_legacy_t hQueue,
ur_kernel_handle_t hKernel,
ur_result_t getZeKernel(ze_device_handle_t hDevice, ur_kernel_handle_t hKernel,
ze_kernel_handle_t *phZeKernel) {
auto ZeDevice = hQueue->Device->ZeDevice;

if (hKernel->ZeKernelMap.empty()) {
*phZeKernel = hKernel->ZeKernel;
} else {
auto It = hKernel->ZeKernelMap.find(ZeDevice);
auto It = hKernel->ZeKernelMap.find(hDevice);
if (It == hKernel->ZeKernelMap.end()) {
/* kernel and queue don't match */
return UR_RESULT_ERROR_INVALID_QUEUE;
Expand Down Expand Up @@ -135,7 +258,7 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueKernelLaunch(

auto Queue = this;
ze_kernel_handle_t ZeKernel{};
UR_CALL(getZeKernel(Queue, Kernel, &ZeKernel));
UR_CALL(getZeKernel(Queue->Device->ZeDevice, Kernel, &ZeKernel));

// Lock automatically releases when this goes out of scope.
std::scoped_lock<ur_shared_mutex, ur_shared_mutex, ur_shared_mutex> Lock(
Expand Down
Loading