gpu: sycl: add reorder primitive
t4c1 committed May 21, 2024
1 parent 6f806c4 commit 9e12842
Showing 6 changed files with 412 additions and 0 deletions.
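
Note: the following sketch is not part of the commit; it illustrates how the new implementation is reached through the public oneDNN C++ API. A plain-format reorder created on a SYCL GPU engine in the NVIDIA vendor build (where this commit registers the implementation) can now dispatch to sycl::ref_reorder_t. The shapes and data types below are arbitrary examples.

#include "dnnl.hpp"

int main() {
    using namespace dnnl;
    engine eng(engine::kind::gpu, 0); // SYCL GPU engine (runtime-dependent)
    stream s(eng);

    // Plain (non-blocked) layouts on both sides, as pd_t::init() requires:
    // inner_nblks == 0 for both src and dst.
    memory::desc src_md({8, 16, 32, 32}, memory::data_type::f32,
            memory::format_tag::nchw);
    memory::desc dst_md({8, 16, 32, 32}, memory::data_type::f16,
            memory::format_tag::nhwc);

    memory src(src_md, eng), dst(dst_md, eng);
    reorder(reorder::primitive_desc(eng, src_md, eng, dst_md))
            .execute(s, src, dst);
    s.wait();
    return 0;
}
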
2 changes: 2 additions & 0 deletions src/gpu/gpu_reorder_list.cpp
@@ -28,6 +28,7 @@
#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
#include "gpu/intel/ocl/cross_engine_reorder.hpp"
#include "gpu/nvidia/cudnn_reorder.hpp"
#include "gpu/sycl/ref_reorder.hpp"
#endif

#if DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
@@ -53,6 +54,7 @@ constexpr impl_list_item_t impl_list[] = REG_REORDER_P({
GPU_REORDER_INSTANCE_INTEL(intel::ocl::ref_reorder_t::pd_t) // slow but fits every use case
GPU_REORDER_INSTANCE_NVIDIA(intel::ocl::cross_engine_reorder_t::pd_t)
GPU_REORDER_INSTANCE_NVIDIA(nvidia::cudnn_reorder_t::pd_t)
GPU_REORDER_INSTANCE_NVIDIA(sycl::ref_reorder_t::pd_t)
GPU_REORDER_INSTANCE_AMD(intel::ocl::cross_engine_reorder_t::pd_t)
GPU_REORDER_INSTANCE_AMD(amd::miopen_reorder_t::pd_t)
nullptr,
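Since oneDNN tries reorder implementations in list order, the new sycl::ref_reorder_t entry sits after cudnn_reorder_t and acts as a generic fallback for cases the cuDNN-based reorder rejects.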
95 changes: 95 additions & 0 deletions src/gpu/sycl/ref_reorder.cpp
@@ -0,0 +1,95 @@
/*******************************************************************************
* Copyright 2022-2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "gpu/sycl/ref_reorder.hpp"
#include "gpu/sycl/reorder_kernels.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace sycl {

using namespace impl::sycl;

status_t ref_reorder_t::pd_t::init_conf() {
conf_ = sycl_reorder_conf_t();

conf_.src_md = xpu::sycl::md_t(src_md(0));
conf_.dst_md = xpu::sycl::md_t(dst_md());

// XXX: should probably be tuned.
conf_.block_size = 16;
conf_.wg_size = 32;

conf_.wk_size = memory_desc_wrapper(src_md(0)).nelems();

conf_.do_scale_src
= !attr()->scales_.get(DNNL_ARG_SRC_0).has_default_values();
conf_.scale_src_mask = attr()->scales_.get(DNNL_ARG_SRC_0).mask_;
conf_.do_scale_dst
= !attr()->scales_.get(DNNL_ARG_DST).has_default_values();
conf_.scale_dst_mask = attr()->scales_.get(DNNL_ARG_DST).mask_;
conf_.post_ops = sycl_post_ops_t(attr());

return status::success;
}

status_t ref_reorder_t::init(engine_t *engine) {
const auto kid = ::sycl::get_kernel_id<reorder_kernel_t>();
CHECK(create_kernel(engine, kid, &kernel_));
return status::success;
}

status_t ref_reorder_t::execute(const exec_ctx_t &ctx) const {
parallel_for(ctx, kernel_, [&](::sycl::handler &cgh) {
auto src_mem_arg = CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC_0);
auto src_scale_mem_arg = CTX_IN_SYCL_KERNEL_MEMORY(
DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0);
auto dst_mem_arg = CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST);
auto dst_scale_mem_arg = CTX_IN_SYCL_KERNEL_MEMORY(
DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST);

        // Scale memories default to f32 when scaling is disabled.
        auto scales_src_dt = (pd()->conf_.do_scale_src)
                ? ctx.memory_mdw(DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0)
                          .data_type()
                : data_type_t::dnnl_f32;
        auto scales_dst_dt = (pd()->conf_.do_scale_dst)
                ? ctx.memory_mdw(DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST)
                          .data_type()
                : data_type_t::dnnl_f32;

reorder_kernel_t reorder_kernel(pd()->conf_, src_mem_arg, dst_mem_arg,
src_scale_mem_arg, dst_scale_mem_arg, scales_src_dt,
scales_dst_dt);

const int block_size = pd()->conf_.block_size;
const int wg_size = pd()->conf_.wg_size;

const int t_work = pd()->conf_.wk_size;
const int wg_work = wg_size * block_size;
const int wg_cnt = utils::div_up(t_work, wg_work);

cgh.parallel_for(
::sycl::nd_range<1>(wg_cnt * wg_size, wg_size), reorder_kernel);
});

return status::success;
}

} // namespace sycl
} // namespace gpu
} // namespace impl
} // namespace dnnl
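
As a hedged aside (not in the commit): execute() sizes the nd_range so that wg_cnt work-groups of wg_size work-items, each work-item covering block_size consecutive elements, span all wk_size elements; the tail is masked inside the kernel by the idx < wk_size check. A standalone sketch of that arithmetic, using the defaults from init_conf() and a hypothetical 1000-element tensor:

#include <iostream>

int main() {
    const int wk_size = 1000; // total elements to reorder (conf_.wk_size)
    const int block_size = 16; // elements per work-item (conf_.block_size)
    const int wg_size = 32; // work-items per work-group (conf_.wg_size)
    const int wg_work = wg_size * block_size; // 512 elements per work-group
    const int wg_cnt = (wk_size + wg_work - 1) / wg_work; // div_up -> 2
    // nd_range<1>(wg_cnt * wg_size, wg_size): 64 work-items in 2 groups,
    // covering 2 * 512 = 1024 slots for 1000 elements.
    std::cout << "global=" << wg_cnt * wg_size << " local=" << wg_size << "\n";
    return 0;
}
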
115 changes: 115 additions & 0 deletions src/gpu/sycl/ref_reorder.hpp
@@ -0,0 +1,115 @@
/*******************************************************************************
* Copyright 2022-2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_SYCL_REF_REORDER_HPP
#define GPU_SYCL_REF_REORDER_HPP

#include "gpu/gpu_reorder_pd.hpp"
#include "gpu/sycl/sycl_gpu_primitive.hpp"
#include "gpu/sycl/sycl_io_helper.hpp"
#include "gpu/sycl/sycl_post_ops.hpp"
#include "gpu/sycl/sycl_primitive_conf.hpp"
#include "gpu/sycl/sycl_q10n.hpp"
#include "sycl/sycl_stream.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace sycl {

struct ref_reorder_t : public sycl_gpu_primitive_t {
using sycl_gpu_primitive_t::sycl_gpu_primitive_t;

struct pd_t : public gpu_reorder_pd_t {
using gpu_reorder_pd_t::gpu_reorder_pd_t;

DECLARE_COMMON_PD_T("dpcpp:ref:any", ref_reorder_t);

status_t init(
engine_t *engine, engine_t *src_engine, engine_t *dst_engine) {
using namespace data_type;
using sm = primitive_attr_t::skip_mask_t;

const memory_desc_wrapper src_d(src_md(0));
const memory_desc_wrapper dst_d(dst_md());

const bool ok = check_data_types(src_d, dst_d)
&& check_formats(src_d, dst_d)
&& attr()->has_default_values(
sm::scales_runtime | sm::post_ops)
&& (src_md(0)->format_desc.blocking.inner_nblks == 0)
&& (dst_md()->format_desc.blocking.inner_nblks == 0)
&& post_ops_ok();
if (!ok) return status::unimplemented;

return init_conf();
}

sycl_reorder_conf_t conf_;

private:
DECLARE_GPU_REORDER_CREATE();

status_t init_conf();

bool post_ops_ok() const {
for (int i = 0; i < attr()->post_ops_.len(); i++) {
if (!attr()->post_ops_.entry_[i].is_sum()) { return false; }
}
return attr()->post_ops_.len() <= sycl_post_ops_t::max_post_ops
&& attr()->post_ops_.has_default_values(
{primitive_kind::sum});
}

static bool check_data_types(const memory_desc_wrapper &src,
const memory_desc_wrapper &dst) {
using namespace data_type;

const auto src_dt = src.data_type();
const auto dst_dt = dst.data_type();

for (auto t : {src_dt, dst_dt}) {
if (!utils::one_of(t, f32, bf16, f16, s8, u8)) return false;
}

return true;
}

static bool check_formats(const memory_desc_wrapper &src,
const memory_desc_wrapper &dst) {
using namespace format_tag;

for (const auto &mdw : {src, dst}) {
if (!mdw.is_plain()) { return false; }
}
return true;
}
};

status_t init(engine_t *engine) override;
status_t execute(const exec_ctx_t &ctx) const override;

private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
intel::compute::kernel_t kernel_;
};

} // namespace sycl
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif
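
For orientation (a sketch, not part of the commit): pd_t::init() accepts only runtime scales and sum post-ops, so an attribute built as below is the richest configuration this implementation handles. The scale values are then passed at execution time as DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC / DNNL_ARG_DST memories.

#include "dnnl.hpp"

dnnl::primitive_attr make_reorder_attr() {
    dnnl::primitive_attr attr;
    dnnl::post_ops po;
    po.append_sum(1.f); // only sum post-ops pass post_ops_ok()
    attr.set_post_ops(po);
    attr.set_scales_mask(DNNL_ARG_SRC, 0); // common (per-tensor) src scale
    attr.set_scales_mask(DNNL_ARG_DST, 0); // common dst scale
    return attr;
}
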
169 changes: 169 additions & 0 deletions src/gpu/sycl/reorder_kernels.hpp
@@ -0,0 +1,169 @@
/*******************************************************************************
* Copyright 2022-2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_SYCL_REORDER_KERNELS_HPP
#define GPU_SYCL_REORDER_KERNELS_HPP

#include "gpu/sycl/sycl_io_helper.hpp"
#include "gpu/sycl/sycl_post_ops.hpp"
#include "gpu/sycl/sycl_primitive_conf.hpp"
#include "gpu/sycl/sycl_q10n.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace sycl {

struct reorder_kernel_t {
static constexpr int vec_len = 8;
static constexpr int max_supported_ndims = 6;

reorder_kernel_t(const sycl_reorder_conf_t &conf,
xpu::sycl::in_memory_arg_t &src, xpu::sycl::out_memory_arg_t &dst,
xpu::sycl::in_memory_arg_t &src_scale,
xpu::sycl::in_memory_arg_t &dst_scale, data_type_t scales_src_dt,
data_type_t scales_dst_dt)
: conf_(conf)
, src_(src)
, dst_(dst)
, src_scale_(src_scale)
, dst_scale_(dst_scale)
, scales_src_dt_(scales_src_dt)
, scales_dst_dt_(scales_dst_dt) {}

void operator()(::sycl::nd_item<1> item) const {
auto sg = item.get_sub_group();
        // Linear index of this work-item within the global range:
        // work-group base + sub-group base + lane id.
        size_t wg_offset_t = item.get_group(0) * conf_.wg_size;
        size_t sg_offset_t = sg.get_group_id()[0] * sg.get_local_range()[0];
        size_t wi_offset_t = sg.get_local_id();
        size_t offset_t = wg_offset_t + sg_offset_t + wi_offset_t;

        // Each work-item processes a contiguous block of block_size elements.
        size_t base_idx = offset_t * conf_.block_size;

float scale_src = conf_.do_scale_src && conf_.scale_src_mask == 0
? load_float_value(scales_src_dt_, src_scale_ptr(), 0)
: 1.f;
float scale_dst = conf_.do_scale_dst && conf_.scale_dst_mask == 0
? load_float_value(scales_dst_dt_, dst_scale_ptr(), 0)
: 1.f;

        dims_t dims, off, strides;
        // Pad dims/strides beyond ndims so the index decomposition below
        // yields offset 0 for unused dimensions.
        for (int i = 0; i < max_supported_ndims; i++) {
            dims[i] = (i < src_md().ndims()) ? src_md().dims()[i] : 1;
            strides[i]
                    = (i < src_md().ndims()) ? src_md().strides()[i] : INT_MAX;
        }
dims_t dims_scales_src;
if (conf_.scale_src_mask != 0) {
for (int i = 0; i < max_supported_ndims; i++) {
dims_scales_src[i]
= conf_.scale_src_mask >> i & 1 ? dims[i] : 1;
}
}
dims_t dims_scales_dst;
if (conf_.scale_dst_mask != 0) {
for (int i = 0; i < max_supported_ndims; i++) {
dims_scales_dst[i]
= conf_.scale_dst_mask >> i & 1 ? dims[i] : 1;
}
}

        for (int i = 0; i < conf_.block_size; i++) {
            int idx = base_idx + i;
            if (idx < conf_.wk_size) {
                // Decompose the linear index into per-dimension offsets
                // (`d` avoids shadowing the block loop variable `i`).
                for (int d = 0; d < max_supported_ndims; d++) {
                    off[d] = idx / strides[d] % dims[d];
                }

int dst_idx = dst_md().off_v(off);
auto src = load_float_value(
src_md().data_type(), src_ptr(), idx);
auto dst = load_float_value(
dst_md().data_type(), dst_ptr(), dst_idx);

if (conf_.do_scale_src) {
if (conf_.scale_src_mask != 0) {
int scale_idx = 0;
                        for (int d = 0; d < max_supported_ndims; d++) {
                            if (d < src_md().ndims()) {
                                int off_scales_d = conf_.scale_src_mask >> d & 1
                                        ? off[d]
                                        : 0;
                                scale_idx = scale_idx * dims_scales_src[d]
                                        + off_scales_d;
                            }
                        }
scale_src = load_float_value(
scales_src_dt_, src_scale_ptr(), scale_idx);
}
src *= scale_src;
}

auto acc = src;
acc = conf_.post_ops.apply(acc, dst);
if (conf_.do_scale_dst) {
if (conf_.scale_dst_mask != 0) {
int scale_idx = 0;
                        for (int d = 0; d < max_supported_ndims; d++) {
                            if (d < src_md().ndims()) {
                                int off_scales_d = conf_.scale_dst_mask >> d & 1
                                        ? off[d]
                                        : 0;
                                scale_idx = scale_idx * dims_scales_dst[d]
                                        + off_scales_d;
                            }
                        }

scale_dst = load_float_value(
scales_dst_dt_, dst_scale_ptr(), scale_idx);
}
acc /= scale_dst;
}
store_float_value(
dst_md().data_type(), acc, dst_ptr(), dst_idx);
}
}
}

private:
const xpu::sycl::md_t &src_md() const { return conf_.src_md; }
const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; }

void *src_ptr() const { return src_.get_pointer(); }
void *dst_ptr() const { return dst_.get_pointer(); }
float *src_scale_ptr() const {
return static_cast<float *>(src_scale_.get_pointer());
}
float *dst_scale_ptr() const {
return static_cast<float *>(dst_scale_.get_pointer());
}

sycl_reorder_conf_t conf_;

xpu::sycl::in_memory_arg_t src_;
xpu::sycl::out_memory_arg_t dst_;
xpu::sycl::in_memory_arg_t src_scale_;
xpu::sycl::in_memory_arg_t dst_scale_;
data_type_t scales_src_dt_;
data_type_t scales_dst_dt_;
};

} // namespace sycl
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif
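
A host-side sketch (hypothetical, mirroring operator() above) of the two index computations the kernel relies on: decomposing a linear element index into logical offsets for a dense plain layout, and compressing those offsets into a scale-tensor index under a mask whose bit d means "scales vary along dim d":

#include <array>

constexpr int max_ndims = 6;
using sketch_dims_t = std::array<int, max_ndims>;

// off[d] = idx / strides[d] % dims[d], exactly as in the kernel; assumes
// dims/strides are padded with 1 / INT_MAX beyond ndims.
sketch_dims_t decompose(
        int idx, const sketch_dims_t &dims, const sketch_dims_t &strides) {
    sketch_dims_t off {};
    for (int d = 0; d < max_ndims; d++)
        off[d] = idx / strides[d] % dims[d];
    return off;
}

// Masked-off dims collapse to size 1 and offset 0, matching the kernel's
// scale_idx accumulation for src and dst scales.
int scale_index(int mask, int ndims, const sketch_dims_t &off,
        const sketch_dims_t &dims) {
    int scale_idx = 0;
    for (int d = 0; d < ndims; d++) {
        const int dim_d = (mask >> d & 1) ? dims[d] : 1;
        const int off_d = (mask >> d & 1) ? off[d] : 0;
        scale_idx = scale_idx * dim_d + off_d;
    }
    return scale_idx;
}
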