Skip to content

Commit

Permalink
gpu: compute: implement native float atomics support check
Browse files Browse the repository at this point in the history
  • Loading branch information
kealan-barbieri authored and karturov committed Mar 27, 2024
1 parent 0b399ac commit ebe77b5
Show file tree
Hide file tree
Showing 13 changed files with 139 additions and 22 deletions.
11 changes: 11 additions & 0 deletions src/gpu/compute/device_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,15 @@ bool device_info_t::mayiuse_sub_group(int size) const {
}
}

// Tells whether the device has native floating-point atomic add for the given
// data type. Only f16, f32 and f64 map to a native extension bit; any other
// data type yields false.
bool device_info_t::mayiuse_float_atomic_add(data_type_t type) const {
    if (type == data_type::f16)
        return has_native(native_ext_t::fp16_atomic_add);
    if (type == data_type::f32)
        return has_native(native_ext_t::fp32_atomic_add);
    if (type == data_type::f64)
        return has_native(native_ext_t::fp64_atomic_add);
    return false;
}

bool device_info_t::has_native(data_type_t type) const {
switch (type) {
case data_type::undef:
Expand Down Expand Up @@ -264,6 +273,7 @@ status_t device_info_t::init_serialized_device_info(
serialized_device_info_.write(&max_wg_size_);
serialized_device_info_.write(&llc_cache_size_);
serialized_device_info_.write(&extensions_);
serialized_device_info_.write(&native_extensions_);
serialized_device_info_.write(&mayiuse_systolic_);
serialized_device_info_.write(&mayiuse_ngen_kernels_);
serialized_device_info_.write(&mayiuse_non_uniform_work_groups_);
Expand Down Expand Up @@ -301,6 +311,7 @@ status_t device_info_t::init_from_cache_blob(
DESERIALIZE(max_wg_size_, size_t);
DESERIALIZE(llc_cache_size_, size_t);
DESERIALIZE(extensions_, uint64_t);
DESERIALIZE(native_extensions_, uint64_t);
DESERIALIZE(mayiuse_systolic_, bool);
DESERIALIZE(mayiuse_ngen_kernels_, bool);
DESERIALIZE(mayiuse_non_uniform_work_groups_, bool);
Expand Down
23 changes: 20 additions & 3 deletions src/gpu/compute/device_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,21 @@ static inline const char *ext2cl_str(device_ext_t ext) {
#undef CASE
}

// Bit flags for natively supported floating-point atomic operations. The flags
// are OR-ed together into device_info_t::native_extensions_ and tested with
// device_info_t::has_native(native_ext_t).
enum class native_ext_t : uint64_t {
// clang-format off
// One bit per (floating-point type, atomic operation class) pair.
fp32_atomic_add = 1ull << 0,
fp32_atomic_min_max = 1ull << 1,
fp32_atomic_load_store = 1ull << 2,
fp16_atomic_add = 1ull << 3,
fp16_atomic_min_max = 1ull << 4,
fp16_atomic_load_store = 1ull << 5,
fp64_atomic_add = 1ull << 6,
fp64_atomic_min_max = 1ull << 7,
fp64_atomic_load_store = 1ull << 8,
// Not a flag: implicitly takes the value of the previous enumerator plus one,
// so it must not be OR-ed into or tested against a flag mask.
last
// clang-format on
};

struct runtime_version_t {
int major;
int minor;
Expand Down Expand Up @@ -221,6 +236,7 @@ struct device_info_t {
}

bool has(device_ext_t ext) const { return extensions_ & (uint64_t)ext; }
// True when the bit for `ext` is set in native_extensions_.
bool has_native(native_ext_t ext) const {
    return (native_extensions_ & static_cast<uint64_t>(ext)) != 0;
}
gpu_arch_t gpu_arch() const { return gpu_arch_; }
int stepping_id() const { return stepping_id_; }
int max_eus_per_wg() const { return max_eus_per_wg_; }
Expand Down Expand Up @@ -255,14 +271,14 @@ struct device_info_t {

bool mayiuse_systolic() const { return mayiuse_systolic_; }

bool is_xelpg() const { return is_xelpg_; }

bool mayiuse_non_uniform_work_groups() const {
return mayiuse_non_uniform_work_groups_;
}

bool mayiuse_sub_group(int size) const;

bool mayiuse_float_atomic_add(data_type_t type) const;

bool has_native(data_type_t type) const;

const std::vector<uint8_t> &get_cache_blob() const {
Expand Down Expand Up @@ -292,7 +308,6 @@ struct device_info_t {
int stepping_id_ = 0;
bool mayiuse_systolic_ = false;
bool mayiuse_ngen_kernels_ = false;
bool is_xelpg_ = false;

std::string name_;
runtime_version_t runtime_version_;
Expand All @@ -310,6 +325,8 @@ struct device_info_t {

// extensions_ and gpu_arch_ describe effective extensions and GPU architecture.
uint64_t extensions_ = 0;
// native extensions, may differ from support reported by runtime.
uint64_t native_extensions_ = 0;

private:
status_t init_attributes_common(engine_t *engine);
Expand Down
2 changes: 2 additions & 0 deletions src/gpu/compute/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
#include <vector>

#include "common/utils.hpp"
#include "gpu/compute/device_info.hpp"
#include "gpu/utils.hpp"

namespace dnnl {
namespace impl {
Expand Down
6 changes: 3 additions & 3 deletions src/gpu/jit/codegen/codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class ir_to_ngen_t : public ir_visitor_t {
, expr_binding_(expr_binding)
, simd_size_(host->getSIMD())
, eu_count_(host->exec_cfg_.hw().eu_count())
, is_xelpg_(host->exec_cfg_.hw().is_xelpg()) {}
, with_atomic_fp64_(host->exec_cfg_.hw().has_fp64_atomic_support()) {}

~ir_to_ngen_t() {
#ifdef DNNL_DEV_MODE
Expand Down Expand Up @@ -677,7 +677,7 @@ class ir_to_ngen_t : public ir_visitor_t {
if ((hw <= ngen::HW::XeLP && send_func.is_atomic())
|| (hw == ngen::HW::XeHPG && send_func.is_atomic()
&& send_func.type.kind() == type_kind_t::qword
&& is_xelpg_)) {
&& !with_atomic_fp64_)) {
send_atomic_add_emu(scope, send_func, mask_op, mod, mem_buf_rd,
surf_bti, mem_off_op.reg_data(), rd);
} else {
Expand Down Expand Up @@ -767,7 +767,7 @@ class ir_to_ngen_t : public ir_visitor_t {
expr_binding_t expr_binding_;
int simd_size_;
int eu_count_;
bool is_xelpg_;
bool with_atomic_fp64_;

#ifdef DNNL_DEV_MODE
int bank_conflicts_ = 0;
Expand Down
2 changes: 1 addition & 1 deletion src/gpu/jit/conv/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -838,7 +838,7 @@ bool data_types_ok(const conv_problem_t &prb, const hw_t &hw) {
return false;
if (prb.is_f64_conv()
&& (utils::one_of(hw.to_ngen(), ngen::HW::XeLP, ngen::HW::XeHPG)
&& !hw.is_xelpg()))
&& !hw.has_fp64_atomic_support()))
return false;
if (is_bf8
&& !(utils::one_of(hw, ngen::HW::XeHPC) && hw.systolic_support()))
Expand Down
4 changes: 2 additions & 2 deletions src/gpu/jit/conv/plan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2292,8 +2292,8 @@ class plan_builder_t {
// XeLPG. F64 fadd emulation is only reliable with vec_size 8 on XeLPG.
bool requires_fadd
= prb_.is_bwd_w && gemm_schedule_.with_kernel_grid_k_slicing();
if (cfg_.hw().is_xelpg() && requires_fadd && c_layout.elems() % 8 != 0
&& c_type == type_t::f64()) {
if (!cfg_.hw().has_fp64_atomic_support() && requires_fadd
&& c_layout.elems() % 8 != 0 && c_type == type_t::f64()) {
return plan_status_t::invalid_c_layout;
}

Expand Down
7 changes: 4 additions & 3 deletions src/gpu/jit/ir/hw.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ class hw_t {
device_info->max_wg_size(/*large_grf_mode=*/false));
large_grf_support_ = compute_engine->mayiuse_large_grf_mode();
systolic_support_ = device_info->mayiuse_systolic();
is_xelpg_ = device_info->is_xelpg();
with_atomic_fp64_
= device_info->mayiuse_float_atomic_add(data_type::f64);

#ifdef DNNL_DEV_MODE
gpu_arch_t old_arch = gpu_arch;
Expand All @@ -99,7 +100,7 @@ class hw_t {
}

bool is_undef() const { return hw_ == ngen::HW::Unknown; }
bool is_xelpg() const { return is_xelpg_; }
bool has_fp64_atomic_support() const { return with_atomic_fp64_; }
ngen::HW to_ngen() const { return hw_; }
int stepping_id() const { return stepping_id_; }
int eu_count() const { return eu_count_; }
Expand Down Expand Up @@ -198,7 +199,7 @@ class hw_t {
int max_wg_size_ = 0;
bool large_grf_support_ = false;
bool systolic_support_ = false;
bool is_xelpg_ = false;
bool with_atomic_fp64_ = false;
};

inline hw_t str_to_hw(const std::string &s) {
Expand Down
2 changes: 1 addition & 1 deletion src/gpu/ocl/ocl_gpu_device_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ status_t ocl_gpu_device_info_t::init_arch(engine_t *engine) {
OCL_CHECK(err);

init_gpu_hw_info(engine, device, context, gpu_arch_, stepping_id_,
mayiuse_systolic_, mayiuse_ngen_kernels_, is_xelpg_);
native_extensions_, mayiuse_systolic_, mayiuse_ngen_kernels_);

err = clReleaseContext(context);
OCL_CHECK(err);
Expand Down
10 changes: 7 additions & 3 deletions src/gpu/ocl/ocl_gpu_hw_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,14 @@ namespace gpu {
namespace ocl {

void init_gpu_hw_info(engine_t *engine, cl_device_id device, cl_context context,
compute::gpu_arch_t &gpu_arch, int &stepping_id, bool &mayiuse_systolic,
bool &mayiuse_ngen_kernels, bool &is_xelpg) {
compute::gpu_arch_t &gpu_arch, int &stepping_id,
uint64_t &native_extensions, bool &mayiuse_systolic,
bool &mayiuse_ngen_kernels) {
using namespace ngen;
HW hw = HW::Unknown;
Product product = {ProductFamily::Unknown, 0};
jit::jit_generator<HW::Unknown>::detectHWInfo(context, device, hw, product);
is_xelpg = (product.family == ngen::ProductFamily::ARL
bool is_xelpg = (product.family == ngen::ProductFamily::ARL
|| product.family == ngen::ProductFamily::MTL);

gpu_arch = jit::convert_ngen_arch_to_dnnl(hw);
Expand All @@ -43,6 +44,9 @@ void init_gpu_hw_info(engine_t *engine, cl_device_id device, cl_context context,
status_t ret
= get_ocl_device_enabled_systolic_intel(device, mayiuse_systolic);
assert(ret == CL_SUCCESS);
ret = get_ocl_device_enabled_native_float_atomics(
device, native_extensions, is_xelpg);
assert(ret == CL_SUCCESS);
MAYBE_UNUSED(ret);

auto status
Expand Down
5 changes: 3 additions & 2 deletions src/gpu/ocl/ocl_gpu_hw_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,9 @@ namespace gpu {
namespace ocl {

void init_gpu_hw_info(engine_t *engine, cl_device_id device, cl_context context,
compute::gpu_arch_t &gpu_arch, int &stepping_id, bool &mayiuse_systolic,
bool &mayiuse_ngen_kernels, bool &is_xelpg);
compute::gpu_arch_t &gpu_arch, int &stepping_id,
uint64_t &native_extensions, bool &mayiuse_systolic,
bool &mayiuse_ngen_kernels);

} // namespace ocl
} // namespace gpu
Expand Down
79 changes: 79 additions & 0 deletions src/gpu/ocl/ocl_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,28 @@
#define CL_DEVICE_FEATURE_CAPABILITIES_INTEL 0x4256
#endif

// Fallback definitions of the per-type float atomic capability queries passed
// to clGetDeviceInfo, used only when the OpenCL headers in use do not already
// define them.
#ifndef CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT
#define CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT 0x4231
#endif

#ifndef CL_DEVICE_DOUBLE_FP_ATOMIC_CAPABILITIES_EXT
#define CL_DEVICE_DOUBLE_FP_ATOMIC_CAPABILITIES_EXT 0x4232
#endif

#ifndef CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT
#define CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT 0x4233
#endif

// Capability bits tested in the bitfield returned by the queries above.
// NOTE(review): the guard is CL_DEVICE_ATOMIC_FLAGS, which is defined only
// here, rather than the individual macro names -- confirm this cannot clash
// with OpenCL headers that already define the *_FP_ATOMIC_*_EXT bits.
#ifndef CL_DEVICE_ATOMIC_FLAGS
#define CL_DEVICE_ATOMIC_FLAGS
#define CL_DEVICE_GLOBAL_FP_ATOMIC_LOAD_STORE_EXT (1 << 0)
#define CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT (1 << 1)
#define CL_DEVICE_GLOBAL_FP_ATOMIC_MIN_MAX_EXT (1 << 2)
#define CL_DEVICE_LOCAL_FP_ATOMIC_LOAD_STORE_EXT (1 << 16)
#define CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT (1 << 17)
#define CL_DEVICE_LOCAL_FP_ATOMIC_MIN_MAX_EXT (1 << 18)
#endif

#ifndef CL_DEVICE_FEATURE_FLAG_DPAS_INTEL
#define CL_DEVICE_FEATURE_FLAG_DPAS_INTEL (1 << 1)
#endif
Expand Down Expand Up @@ -479,6 +501,63 @@ status_t get_ocl_device_enabled_systolic_intel(
return status::success;
}

// Records which floating-point atomic operations `device` reports as
// supported.
//
// For each of f16, f32 and f64 the matching *_FP_ATOMIC_CAPABILITIES_EXT
// bitfield is queried. A native_ext_t bit is OR-ed into `native_extensions`
// for every operation class (load/store, add, min/max) whose GLOBAL and LOCAL
// capability bits are both set. Bits already present in `native_extensions`
// are left untouched.
//
// A failed query is treated as "no support for that type" and does not stop
// the remaining queries: a runtime that does not recognise one capability
// parameter must not hide the capabilities of the other types, nor make the
// whole device initialization fail.
//
// The f64 query is skipped entirely when `is_xelpg` is true.
//
// Always returns status::success.
status_t get_ocl_device_enabled_native_float_atomics(
        cl_device_id device, uint64_t &native_extensions, bool is_xelpg) {
    using native_ext_t = gpu::compute::native_ext_t;

    // Queries one capability bitfield and translates it into native_ext_t
    // bits. Shared by all three floating-point types.
    const auto record_caps = [&](cl_device_info param, native_ext_t load_store,
                                     native_ext_t add, native_ext_t min_max) {
        cl_bitfield caps = 0;
        if (clGetDeviceInfo(device, param, sizeof(caps), &caps, nullptr)
                != CL_SUCCESS)
            return;

        // An operation class counts only when both scopes report it.
        const auto global_and_local
                = [&](cl_bitfield global_bit, cl_bitfield local_bit) {
                      return (caps & global_bit) && (caps & local_bit);
                  };

        if (global_and_local(CL_DEVICE_GLOBAL_FP_ATOMIC_LOAD_STORE_EXT,
                    CL_DEVICE_LOCAL_FP_ATOMIC_LOAD_STORE_EXT))
            native_extensions |= static_cast<uint64_t>(load_store);
        if (global_and_local(CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT,
                    CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT))
            native_extensions |= static_cast<uint64_t>(add);
        if (global_and_local(CL_DEVICE_GLOBAL_FP_ATOMIC_MIN_MAX_EXT,
                    CL_DEVICE_LOCAL_FP_ATOMIC_MIN_MAX_EXT))
            native_extensions |= static_cast<uint64_t>(min_max);
    };

    record_caps(CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT,
            native_ext_t::fp16_atomic_load_store,
            native_ext_t::fp16_atomic_add, native_ext_t::fp16_atomic_min_max);

    record_caps(CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT,
            native_ext_t::fp32_atomic_load_store,
            native_ext_t::fp32_atomic_add, native_ext_t::fp32_atomic_min_max);

    // XeLPG lacks native support for f64 atomics.
    if (!is_xelpg) {
        record_caps(CL_DEVICE_DOUBLE_FP_ATOMIC_CAPABILITIES_EXT,
                native_ext_t::fp64_atomic_load_store,
                native_ext_t::fp64_atomic_add,
                native_ext_t::fp64_atomic_min_max);
    }

    return status::success;
}

status_t get_ocl_device_eu_count(cl_device_id device, int32_t *eu_count) {
// Try to use Intel-specific slices/sub-slices to deduce EU count.
auto status = get_ocl_device_eu_count_intel(device, eu_count);
Expand Down
3 changes: 3 additions & 0 deletions src/gpu/ocl/ocl_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,9 @@ status_t get_ocl_device_eu_count(cl_device_id device, int32_t *eu_count);
status_t get_ocl_device_enabled_systolic_intel(
cl_device_id device, bool &systolic_enabled);

// ORs into `native_extensions` the gpu::compute::native_ext_t bits for the
// f16/f32/f64 atomic load/store, add and min/max capabilities that `device`
// reports for both the global and the local scope. The f64 capabilities are
// not queried when `is_xelpg` is true.
status_t get_ocl_device_enabled_native_float_atomics(
cl_device_id device, uint64_t &native_extensions, bool is_xelpg);

status_t clone_kernel(cl_kernel kernel, cl_kernel *cloned_kernel);

status_t create_ocl_program(gpu::ocl::ocl_wrapper_t<cl_program> &ocl_program,
Expand Down
7 changes: 3 additions & 4 deletions src/sycl/sycl_device_info.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2020-2023 Intel Corporation
* Copyright 2020-2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -51,8 +51,8 @@ status_t sycl_device_info_t::init_arch(engine_t *engine) {
OCL_CHECK(err);

gpu::ocl::init_gpu_hw_info(engine, ocl_dev_wrapper, ocl_ctx_wrapper,
gpu_arch_, stepping_id_, mayiuse_systolic_,
mayiuse_ngen_kernels_, is_xelpg_);
gpu_arch_, stepping_id_, native_extensions_, mayiuse_systolic_,
mayiuse_ngen_kernels_);
} else if (be == backend_t::level0) {
// TODO: add support for L0 binary ngen check
// XXX: query from ocl_engine for now
Expand All @@ -71,7 +71,6 @@ status_t sycl_device_info_t::init_arch(engine_t *engine) {
stepping_id_ = dev_info->stepping_id();
mayiuse_systolic_ = dev_info->mayiuse_systolic();
mayiuse_ngen_kernels_ = dev_info->mayiuse_ngen_kernels();
is_xelpg_ = dev_info->is_xelpg();
} else {
assert(!"not_expected");
}
Expand Down

0 comments on commit ebe77b5

Please sign in to comment.