-
Notifications
You must be signed in to change notification settings - Fork 971
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
412 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
/******************************************************************************* | ||
* Copyright 2022-2023 Intel Corporation | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*******************************************************************************/ | ||
|
||
#include "gpu/sycl/ref_reorder.hpp" | ||
#include "gpu/sycl/reorder_kernels.hpp" | ||
|
||
namespace dnnl { | ||
namespace impl { | ||
namespace gpu { | ||
namespace sycl { | ||
|
||
using namespace impl::sycl; | ||
|
||
status_t ref_reorder_t::pd_t::init_conf() {
    // Start from a default-constructed configuration.
    conf_ = sycl_reorder_conf_t();

    conf_.src_md = xpu::sycl::md_t(src_md(0));
    conf_.dst_md = xpu::sycl::md_t(dst_md());

    // XXX: should probably be tuned.
    conf_.block_size = 16;
    conf_.wg_size = 32;

    // Total number of elements to process.
    conf_.wk_size = memory_desc_wrapper(src_md(0)).nelems();

    // Runtime scaling: record whether each side has non-default scales and
    // the corresponding broadcast mask.
    const auto &src_scales = attr()->scales_.get(DNNL_ARG_SRC_0);
    const auto &dst_scales = attr()->scales_.get(DNNL_ARG_DST);
    conf_.do_scale_src = !src_scales.has_default_values();
    conf_.scale_src_mask = src_scales.mask_;
    conf_.do_scale_dst = !dst_scales.has_default_values();
    conf_.scale_dst_mask = dst_scales.mask_;
    conf_.post_ops = sycl_post_ops_t(attr());

    return status::success;
}
|
||
status_t ref_reorder_t::init(engine_t *engine) {
    // Look up the compiled reorder kernel by id and cache it for execute().
    const auto kernel_id = ::sycl::get_kernel_id<reorder_kernel_t>();
    CHECK(create_kernel(engine, kernel_id, &kernel_));
    return status::success;
}
|
||
status_t ref_reorder_t::execute(const exec_ctx_t &ctx) const {
    const auto &conf = pd()->conf_;

    parallel_for(ctx, kernel_, [&](::sycl::handler &cgh) {
        auto src_mem_arg = CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC_0);
        auto src_scale_mem_arg = CTX_IN_SYCL_KERNEL_MEMORY(
                DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0);
        auto dst_mem_arg = CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST);
        auto dst_scale_mem_arg = CTX_IN_SYCL_KERNEL_MEMORY(
                DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST);

        // Scale data types are only meaningful when scaling is enabled;
        // default to f32 otherwise.
        const auto src_scale_dt = conf.do_scale_src
                ? ctx.memory_mdw(DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0)
                          .data_type()
                : data_type_t::dnnl_f32;
        const auto dst_scale_dt = conf.do_scale_dst
                ? ctx.memory_mdw(DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST)
                          .data_type()
                : data_type_t::dnnl_f32;

        reorder_kernel_t reorder_kernel(conf, src_mem_arg, dst_mem_arg,
                src_scale_mem_arg, dst_scale_mem_arg, src_scale_dt,
                dst_scale_dt);

        // Each work-item handles block_size elements, so a work-group covers
        // wg_size * block_size elements; round the group count up so the
        // whole tensor is processed.
        const int total_work = conf.wk_size;
        const int work_per_wg = conf.wg_size * conf.block_size;
        const int wg_cnt = utils::div_up(total_work, work_per_wg);

        cgh.parallel_for(
                ::sycl::nd_range<1>(wg_cnt * conf.wg_size, conf.wg_size),
                reorder_kernel);
    });

    return status::success;
}
|
||
} // namespace sycl | ||
} // namespace gpu | ||
} // namespace impl | ||
} // namespace dnnl |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
/******************************************************************************* | ||
* Copyright 2022-2024 Intel Corporation | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*******************************************************************************/ | ||
|
||
#ifndef GPU_SYCL_REF_REORDER_HPP | ||
#define GPU_SYCL_REF_REORDER_HPP | ||
|
||
#include "gpu/gpu_reorder_pd.hpp" | ||
#include "gpu/sycl/sycl_gpu_primitive.hpp" | ||
#include "gpu/sycl/sycl_io_helper.hpp" | ||
#include "gpu/sycl/sycl_post_ops.hpp" | ||
#include "gpu/sycl/sycl_primitive_conf.hpp" | ||
#include "gpu/sycl/sycl_q10n.hpp" | ||
#include "sycl/sycl_stream.hpp" | ||
|
||
namespace dnnl { | ||
namespace impl { | ||
namespace gpu { | ||
namespace sycl { | ||
|
||
struct ref_reorder_t : public sycl_gpu_primitive_t { | ||
using sycl_gpu_primitive_t::sycl_gpu_primitive_t; | ||
|
||
struct pd_t : public gpu_reorder_pd_t { | ||
using gpu_reorder_pd_t::gpu_reorder_pd_t; | ||
|
||
DECLARE_COMMON_PD_T("dpcpp:ref:any", ref_reorder_t); | ||
|
||
status_t init( | ||
engine_t *engine, engine_t *src_engine, engine_t *dst_engine) { | ||
using namespace data_type; | ||
using sm = primitive_attr_t::skip_mask_t; | ||
|
||
const memory_desc_wrapper src_d(src_md(0)); | ||
const memory_desc_wrapper dst_d(dst_md()); | ||
|
||
const bool ok = check_data_types(src_d, dst_d) | ||
&& check_formats(src_d, dst_d) | ||
&& attr()->has_default_values( | ||
sm::scales_runtime | sm::post_ops) | ||
&& (src_md(0)->format_desc.blocking.inner_nblks == 0) | ||
&& (dst_md()->format_desc.blocking.inner_nblks == 0) | ||
&& post_ops_ok(); | ||
if (!ok) return status::unimplemented; | ||
|
||
return init_conf(); | ||
} | ||
|
||
sycl_reorder_conf_t conf_; | ||
|
||
private: | ||
DECLARE_GPU_REORDER_CREATE(); | ||
|
||
status_t init_conf(); | ||
|
||
bool post_ops_ok() const { | ||
for (int i = 0; i < attr()->post_ops_.len(); i++) { | ||
if (!attr()->post_ops_.entry_[i].is_sum()) { return false; } | ||
} | ||
return attr()->post_ops_.len() <= sycl_post_ops_t::max_post_ops | ||
&& attr()->post_ops_.has_default_values( | ||
{primitive_kind::sum}); | ||
} | ||
|
||
static bool check_data_types(const memory_desc_wrapper &src, | ||
const memory_desc_wrapper &dst) { | ||
using namespace data_type; | ||
|
||
const auto src_dt = src.data_type(); | ||
const auto dst_dt = dst.data_type(); | ||
|
||
for (auto t : {src_dt, dst_dt}) { | ||
if (!utils::one_of(t, f32, bf16, f16, s8, u8)) return false; | ||
} | ||
|
||
return true; | ||
} | ||
|
||
static bool check_formats(const memory_desc_wrapper &src, | ||
const memory_desc_wrapper &dst) { | ||
using namespace format_tag; | ||
|
||
for (const auto &mdw : {src, dst}) { | ||
if (!mdw.is_plain()) { return false; } | ||
} | ||
return true; | ||
} | ||
}; | ||
|
||
status_t init(engine_t *engine) override; | ||
status_t execute(const exec_ctx_t &ctx) const override; | ||
|
||
private: | ||
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } | ||
intel::compute::kernel_t kernel_; | ||
}; | ||
|
||
} // namespace sycl | ||
} // namespace gpu | ||
} // namespace impl | ||
} // namespace dnnl | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
/******************************************************************************* | ||
* Copyright 2022-2023 Intel Corporation | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*******************************************************************************/ | ||
|
||
#ifndef GPU_SYCL_REORDER_KERNELS_HPP | ||
#define GPU_SYCL_REORDER_KERNELS_HPP | ||
|
||
#include "gpu/sycl/sycl_io_helper.hpp" | ||
#include "gpu/sycl/sycl_post_ops.hpp" | ||
#include "gpu/sycl/sycl_primitive_conf.hpp" | ||
#include "gpu/sycl/sycl_q10n.hpp" | ||
|
||
namespace dnnl { | ||
namespace impl { | ||
namespace gpu { | ||
namespace sycl { | ||
|
||
struct reorder_kernel_t { | ||
static constexpr int vec_len = 8; | ||
static constexpr int max_supported_ndims = 6; | ||
|
||
reorder_kernel_t(const sycl_reorder_conf_t &conf, | ||
xpu::sycl::in_memory_arg_t &src, xpu::sycl::out_memory_arg_t &dst, | ||
xpu::sycl::in_memory_arg_t &src_scale, | ||
xpu::sycl::in_memory_arg_t &dst_scale, data_type_t scales_src_dt, | ||
data_type_t scales_dst_dt) | ||
: conf_(conf) | ||
, src_(src) | ||
, dst_(dst) | ||
, src_scale_(src_scale) | ||
, dst_scale_(dst_scale) | ||
, scales_src_dt_(scales_src_dt) | ||
, scales_dst_dt_(scales_dst_dt) {} | ||
|
||
void operator()(::sycl::nd_item<1> item) const { | ||
auto sg = item.get_sub_group(); | ||
size_t wg_offset_t = item.get_group(0) * conf_.wg_size; | ||
size_t sg_offset_t = sg.get_group_id()[0] * sg.get_local_range()[0]; | ||
size_t wi_offset_t = sg.get_local_id(); | ||
size_t offset_t = wg_offset_t + sg_offset_t + wi_offset_t; | ||
|
||
size_t base_idx = offset_t * conf_.block_size; | ||
|
||
float scale_src = conf_.do_scale_src && conf_.scale_src_mask == 0 | ||
? load_float_value(scales_src_dt_, src_scale_ptr(), 0) | ||
: 1.f; | ||
float scale_dst = conf_.do_scale_dst && conf_.scale_dst_mask == 0 | ||
? load_float_value(scales_dst_dt_, dst_scale_ptr(), 0) | ||
: 1.f; | ||
|
||
dims_t dims, off, strides; | ||
for (int i = 0; i < max_supported_ndims; i++) { | ||
dims[i] = (i < src_md().ndims()) ? src_md().dims()[i] : 1; | ||
strides[i] | ||
= (i < src_md().ndims()) ? src_md().strides()[i] : INT_MAX; | ||
} | ||
dims_t dims_scales_src; | ||
if (conf_.scale_src_mask != 0) { | ||
for (int i = 0; i < max_supported_ndims; i++) { | ||
dims_scales_src[i] | ||
= conf_.scale_src_mask >> i & 1 ? dims[i] : 1; | ||
} | ||
} | ||
dims_t dims_scales_dst; | ||
if (conf_.scale_dst_mask != 0) { | ||
for (int i = 0; i < max_supported_ndims; i++) { | ||
dims_scales_dst[i] | ||
= conf_.scale_dst_mask >> i & 1 ? dims[i] : 1; | ||
} | ||
} | ||
|
||
for (int i = 0; i < conf_.block_size; i++) { | ||
int idx = base_idx + i; | ||
if (idx < conf_.wk_size) { | ||
for (int i = 0; i < max_supported_ndims; i++) { | ||
off[i] = idx / strides[i] % dims[i]; | ||
} | ||
|
||
int dst_idx = dst_md().off_v(off); | ||
auto src = load_float_value( | ||
src_md().data_type(), src_ptr(), idx); | ||
auto dst = load_float_value( | ||
dst_md().data_type(), dst_ptr(), dst_idx); | ||
|
||
if (conf_.do_scale_src) { | ||
if (conf_.scale_src_mask != 0) { | ||
int scale_idx = 0; | ||
for (int i = 0; i < max_supported_ndims; i++) { | ||
if (i < src_md().ndims()) { | ||
int off_scales_i = conf_.scale_src_mask >> i & 1 | ||
? off[i] | ||
: 0; | ||
scale_idx = scale_idx * dims_scales_src[i] | ||
+ off_scales_i; | ||
} | ||
} | ||
scale_src = load_float_value( | ||
scales_src_dt_, src_scale_ptr(), scale_idx); | ||
} | ||
src *= scale_src; | ||
} | ||
|
||
auto acc = src; | ||
acc = conf_.post_ops.apply(acc, dst); | ||
if (conf_.do_scale_dst) { | ||
if (conf_.scale_dst_mask != 0) { | ||
int scale_idx = 0; | ||
for (int i = 0; i < max_supported_ndims; i++) { | ||
if (i < src_md().ndims()) { | ||
int off_scales_i = conf_.scale_dst_mask >> i & 1 | ||
? off[i] | ||
: 0; | ||
scale_idx = scale_idx * dims_scales_dst[i] | ||
+ off_scales_i; | ||
} | ||
} | ||
|
||
scale_dst = load_float_value( | ||
scales_dst_dt_, dst_scale_ptr(), scale_idx); | ||
} | ||
acc /= scale_dst; | ||
} | ||
store_float_value( | ||
dst_md().data_type(), acc, dst_ptr(), dst_idx); | ||
} | ||
} | ||
} | ||
|
||
private: | ||
const xpu::sycl::md_t &src_md() const { return conf_.src_md; } | ||
const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; } | ||
|
||
void *src_ptr() const { return src_.get_pointer(); } | ||
void *dst_ptr() const { return dst_.get_pointer(); } | ||
float *src_scale_ptr() const { | ||
return static_cast<float *>(src_scale_.get_pointer()); | ||
} | ||
float *dst_scale_ptr() const { | ||
return static_cast<float *>(dst_scale_.get_pointer()); | ||
} | ||
|
||
sycl_reorder_conf_t conf_; | ||
|
||
xpu::sycl::in_memory_arg_t src_; | ||
xpu::sycl::out_memory_arg_t dst_; | ||
xpu::sycl::in_memory_arg_t src_scale_; | ||
xpu::sycl::in_memory_arg_t dst_scale_; | ||
data_type_t scales_src_dt_; | ||
data_type_t scales_dst_dt_; | ||
}; | ||
|
||
} // namespace sycl | ||
} // namespace gpu | ||
} // namespace impl | ||
} // namespace dnnl | ||
|
||
#endif |
Oops, something went wrong.