gpu: sycl: add reorder primitive
t4c1 committed May 21, 2024
1 parent 6f806c4 commit 9e12842
Showing 6 changed files with 412 additions and 0 deletions.
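
Note: the following sketch is not part of the commit; it illustrates how the new implementation is reached through the public oneDNN C++ API. A plain-format reorder created on a SYCL GPU engine in the NVIDIA vendor build (where this commit registers the implementation) can now dispatch to sycl::ref_reorder_t. The shapes and data types below are arbitrary examples.

#include "dnnl.hpp"

int main() {
    using namespace dnnl;
    engine eng(engine::kind::gpu, 0); // SYCL GPU engine (runtime-dependent)
    stream s(eng);

    // Plain (non-blocked) layouts on both sides, as pd_t::init() requires:
    // inner_nblks == 0 for both src and dst.
    memory::desc src_md({8, 16, 32, 32}, memory::data_type::f32,
            memory::format_tag::nchw);
    memory::desc dst_md({8, 16, 32, 32}, memory::data_type::f16,
            memory::format_tag::nhwc);

    memory src(src_md, eng), dst(dst_md, eng);
    reorder(reorder::primitive_desc(eng, src_md, eng, dst_md))
            .execute(s, src, dst);
    s.wait();
    return 0;
}
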
2 changes: 2 additions & 0 deletions src/gpu/gpu_reorder_list.cpp
@@ -28,6 +28,7 @@
#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
#include "gpu/intel/ocl/cross_engine_reorder.hpp"
#include "gpu/nvidia/cudnn_reorder.hpp"
#include "gpu/sycl/ref_reorder.hpp"
#endif

#if DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
@@ -53,6 +54,7 @@ constexpr impl_list_item_t impl_list[] = REG_REORDER_P({
GPU_REORDER_INSTANCE_INTEL(intel::ocl::ref_reorder_t::pd_t) // slow but fits every use case
GPU_REORDER_INSTANCE_NVIDIA(intel::ocl::cross_engine_reorder_t::pd_t)
GPU_REORDER_INSTANCE_NVIDIA(nvidia::cudnn_reorder_t::pd_t)
GPU_REORDER_INSTANCE_NVIDIA(sycl::ref_reorder_t::pd_t)
GPU_REORDER_INSTANCE_AMD(intel::ocl::cross_engine_reorder_t::pd_t)
GPU_REORDER_INSTANCE_AMD(amd::miopen_reorder_t::pd_t)
nullptr,
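Since oneDNN tries reorder implementations in list order, the new sycl::ref_reorder_t entry sits after cudnn_reorder_t and acts as a generic fallback for cases the cuDNN-based reorder rejects.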
95 changes: 95 additions & 0 deletions src/gpu/sycl/ref_reorder.cpp
@@ -0,0 +1,95 @@
/*******************************************************************************
* Copyright 2022-2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "gpu/sycl/ref_reorder.hpp"
#include "gpu/sycl/reorder_kernels.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace sycl {

using namespace impl::sycl;

status_t ref_reorder_t::pd_t::init_conf() {
conf_ = sycl_reorder_conf_t();

conf_.src_md = xpu::sycl::md_t(src_md(0));
conf_.dst_md = xpu::sycl::md_t(dst_md());

// XXX: should probably be tuned.
conf_.block_size = 16;
conf_.wg_size = 32;

conf_.wk_size = memory_desc_wrapper(src_md(0)).nelems();

conf_.do_scale_src
= !attr()->scales_.get(DNNL_ARG_SRC_0).has_default_values();
conf_.scale_src_mask = attr()->scales_.get(DNNL_ARG_SRC_0).mask_;
conf_.do_scale_dst
= !attr()->scales_.get(DNNL_ARG_DST).has_default_values();
conf_.scale_dst_mask = attr()->scales_.get(DNNL_ARG_DST).mask_;
conf_.post_ops = sycl_post_ops_t(attr());

return status::success;
}

status_t ref_reorder_t::init(engine_t *engine) {
const auto kid = ::sycl::get_kernel_id<reorder_kernel_t>();
CHECK(create_kernel(engine, kid, &kernel_));
return status::success;
}

status_t ref_reorder_t::execute(const exec_ctx_t &ctx) const {
parallel_for(ctx, kernel_, [&](::sycl::handler &cgh) {
auto src_mem_arg = CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC_0);
auto src_scale_mem_arg = CTX_IN_SYCL_KERNEL_MEMORY(
DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0);
auto dst_mem_arg = CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST);
auto dst_scale_mem_arg = CTX_IN_SYCL_KERNEL_MEMORY(
DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST);

        // Scale memories default to f32 when scaling is disabled.
        auto scales_src_dt = (pd()->conf_.do_scale_src)
                ? ctx.memory_mdw(DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0)
                          .data_type()
                : data_type_t::dnnl_f32;
        auto scales_dst_dt = (pd()->conf_.do_scale_dst)
                ? ctx.memory_mdw(DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST)
                          .data_type()
                : data_type_t::dnnl_f32;

reorder_kernel_t reorder_kernel(pd()->conf_, src_mem_arg, dst_mem_arg,
src_scale_mem_arg, dst_scale_mem_arg, scales_src_dt,
scales_dst_dt);

const int block_size = pd()->conf_.block_size;
const int wg_size = pd()->conf_.wg_size;

const int t_work = pd()->conf_.wk_size;
const int wg_work = wg_size * block_size;
const int wg_cnt = utils::div_up(t_work, wg_work);

cgh.parallel_for(
::sycl::nd_range<1>(wg_cnt * wg_size, wg_size), reorder_kernel);
});

return status::success;
}

} // namespace sycl
} // namespace gpu
} // namespace impl
} // namespace dnnl
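
As a hedged aside (not in the commit): execute() sizes the nd_range so that wg_cnt work-groups of wg_size work-items, each work-item covering block_size consecutive elements, span all wk_size elements; the tail is masked inside the kernel by the idx < wk_size check. A standalone sketch of that arithmetic, using the defaults from init_conf() and a hypothetical 1000-element tensor:

#include <iostream>

int main() {
    const int wk_size = 1000; // total elements to reorder (conf_.wk_size)
    const int block_size = 16; // elements per work-item (conf_.block_size)
    const int wg_size = 32; // work-items per work-group (conf_.wg_size)
    const int wg_work = wg_size * block_size; // 512 elements per work-group
    const int wg_cnt = (wk_size + wg_work - 1) / wg_work; // div_up -> 2
    // nd_range<1>(wg_cnt * wg_size, wg_size): 64 work-items in 2 groups,
    // covering 2 * 512 = 1024 slots for 1000 elements.
    std::cout << "global=" << wg_cnt * wg_size << " local=" << wg_size << "\n";
    return 0;
}
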
115 changes: 115 additions & 0 deletions src/gpu/sycl/ref_reorder.hpp
@@ -0,0 +1,115 @@
/*******************************************************************************
* Copyright 2022-2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_SYCL_REF_REORDER_HPP
#define GPU_SYCL_REF_REORDER_HPP

#include "gpu/gpu_reorder_pd.hpp"
#include "gpu/sycl/sycl_gpu_primitive.hpp"
#include "gpu/sycl/sycl_io_helper.hpp"
#include "gpu/sycl/sycl_post_ops.hpp"
#include "gpu/sycl/sycl_primitive_conf.hpp"
#include "gpu/sycl/sycl_q10n.hpp"
#include "sycl/sycl_stream.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace sycl {

struct ref_reorder_t : public sycl_gpu_primitive_t {
using sycl_gpu_primitive_t::sycl_gpu_primitive_t;

struct pd_t : public gpu_reorder_pd_t {
using gpu_reorder_pd_t::gpu_reorder_pd_t;

DECLARE_COMMON_PD_T("dpcpp:ref:any", ref_reorder_t);

status_t init(
engine_t *engine, engine_t *src_engine, engine_t *dst_engine) {
using namespace data_type;
using sm = primitive_attr_t::skip_mask_t;

const memory_desc_wrapper src_d(src_md(0));
const memory_desc_wrapper dst_d(dst_md());

const bool ok = check_data_types(src_d, dst_d)
&& check_formats(src_d, dst_d)
&& attr()->has_default_values(
sm::scales_runtime | sm::post_ops)
&& (src_md(0)->format_desc.blocking.inner_nblks == 0)
&& (dst_md()->format_desc.blocking.inner_nblks == 0)
&& post_ops_ok();
if (!ok) return status::unimplemented;

return init_conf();
}

sycl_reorder_conf_t conf_;

private:
DECLARE_GPU_REORDER_CREATE();

status_t init_conf();

bool post_ops_ok() const {
for (int i = 0; i < attr()->post_ops_.len(); i++) {
if (!attr()->post_ops_.entry_[i].is_sum()) { return false; }
}
return attr()->post_ops_.len() <= sycl_post_ops_t::max_post_ops
&& attr()->post_ops_.has_default_values(
{primitive_kind::sum});
}

static bool check_data_types(const memory_desc_wrapper &src,
const memory_desc_wrapper &dst) {
using namespace data_type;

const auto src_dt = src.data_type();
const auto dst_dt = dst.data_type();

for (auto t : {src_dt, dst_dt}) {
if (!utils::one_of(t, f32, bf16, f16, s8, u8)) return false;
}

return true;
}

static bool check_formats(const memory_desc_wrapper &src,
const memory_desc_wrapper &dst) {
using namespace format_tag;

for (const auto &mdw : {src, dst}) {
if (!mdw.is_plain()) { return false; }
}
return true;
}
};

status_t init(engine_t *engine) override;
status_t execute(const exec_ctx_t &ctx) const override;

private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
intel::compute::kernel_t kernel_;
};

} // namespace sycl
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif
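
For orientation (a sketch, not part of the commit): pd_t::init() accepts only runtime scales and sum post-ops, so an attribute built as below is the richest configuration this implementation handles. The scale values are then passed at execution time as DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC / DNNL_ARG_DST memories.

#include "dnnl.hpp"

dnnl::primitive_attr make_reorder_attr() {
    dnnl::primitive_attr attr;
    dnnl::post_ops po;
    po.append_sum(1.f); // only sum post-ops pass post_ops_ok()
    attr.set_post_ops(po);
    attr.set_scales_mask(DNNL_ARG_SRC, 0); // common (per-tensor) src scale
    attr.set_scales_mask(DNNL_ARG_DST, 0); // common dst scale
    return attr;
}
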
169 changes: 169 additions & 0 deletions src/gpu/sycl/reorder_kernels.hpp
@@ -0,0 +1,169 @@
/*******************************************************************************
* Copyright 2022-2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_SYCL_REORDER_KERNELS_HPP
#define GPU_SYCL_REORDER_KERNELS_HPP

#include "gpu/sycl/sycl_io_helper.hpp"
#include "gpu/sycl/sycl_post_ops.hpp"
#include "gpu/sycl/sycl_primitive_conf.hpp"
#include "gpu/sycl/sycl_q10n.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace sycl {

struct reorder_kernel_t {
static constexpr int vec_len = 8;
static constexpr int max_supported_ndims = 6;

reorder_kernel_t(const sycl_reorder_conf_t &conf,
xpu::sycl::in_memory_arg_t &src, xpu::sycl::out_memory_arg_t &dst,
xpu::sycl::in_memory_arg_t &src_scale,
xpu::sycl::in_memory_arg_t &dst_scale, data_type_t scales_src_dt,
data_type_t scales_dst_dt)
: conf_(conf)
, src_(src)
, dst_(dst)
, src_scale_(src_scale)
, dst_scale_(dst_scale)
, scales_src_dt_(scales_src_dt)
, scales_dst_dt_(scales_dst_dt) {}

void operator()(::sycl::nd_item<1> item) const {
auto sg = item.get_sub_group();
        // Linear index of this work-item within the global range:
        // work-group base + sub-group base + lane id.
        size_t wg_offset_t = item.get_group(0) * conf_.wg_size;
        size_t sg_offset_t = sg.get_group_id()[0] * sg.get_local_range()[0];
        size_t wi_offset_t = sg.get_local_id();
        size_t offset_t = wg_offset_t + sg_offset_t + wi_offset_t;

        // Each work-item processes a contiguous block of block_size elements.
        size_t base_idx = offset_t * conf_.block_size;

float scale_src = conf_.do_scale_src && conf_.scale_src_mask == 0
? load_float_value(scales_src_dt_, src_scale_ptr(), 0)
: 1.f;
float scale_dst = conf_.do_scale_dst && conf_.scale_dst_mask == 0
? load_float_value(scales_dst_dt_, dst_scale_ptr(), 0)
: 1.f;

        dims_t dims, off, strides;
        // Pad dims/strides beyond ndims so the index decomposition below
        // yields offset 0 for unused dimensions.
        for (int i = 0; i < max_supported_ndims; i++) {
            dims[i] = (i < src_md().ndims()) ? src_md().dims()[i] : 1;
            strides[i]
                    = (i < src_md().ndims()) ? src_md().strides()[i] : INT_MAX;
        }
dims_t dims_scales_src;
if (conf_.scale_src_mask != 0) {
for (int i = 0; i < max_supported_ndims; i++) {
dims_scales_src[i]
= conf_.scale_src_mask >> i & 1 ? dims[i] : 1;
}
}
dims_t dims_scales_dst;
if (conf_.scale_dst_mask != 0) {
for (int i = 0; i < max_supported_ndims; i++) {
dims_scales_dst[i]
= conf_.scale_dst_mask >> i & 1 ? dims[i] : 1;
}
}

        for (int i = 0; i < conf_.block_size; i++) {
            int idx = base_idx + i;
            if (idx < conf_.wk_size) {
                // Decompose the linear index into per-dimension offsets
                // (`d` avoids shadowing the block loop variable `i`).
                for (int d = 0; d < max_supported_ndims; d++) {
                    off[d] = idx / strides[d] % dims[d];
                }

int dst_idx = dst_md().off_v(off);
auto src = load_float_value(
src_md().data_type(), src_ptr(), idx);
auto dst = load_float_value(
dst_md().data_type(), dst_ptr(), dst_idx);

if (conf_.do_scale_src) {
if (conf_.scale_src_mask != 0) {
int scale_idx = 0;
                        for (int d = 0; d < max_supported_ndims; d++) {
                            if (d < src_md().ndims()) {
                                int off_scales_d = conf_.scale_src_mask >> d & 1
                                        ? off[d]
                                        : 0;
                                scale_idx = scale_idx * dims_scales_src[d]
                                        + off_scales_d;
                            }
                        }
scale_src = load_float_value(
scales_src_dt_, src_scale_ptr(), scale_idx);
}
src *= scale_src;
}

auto acc = src;
acc = conf_.post_ops.apply(acc, dst);
if (conf_.do_scale_dst) {
if (conf_.scale_dst_mask != 0) {
int scale_idx = 0;
                        for (int d = 0; d < max_supported_ndims; d++) {
                            if (d < src_md().ndims()) {
                                int off_scales_d = conf_.scale_dst_mask >> d & 1
                                        ? off[d]
                                        : 0;
                                scale_idx = scale_idx * dims_scales_dst[d]
                                        + off_scales_d;
                            }
                        }

scale_dst = load_float_value(
scales_dst_dt_, dst_scale_ptr(), scale_idx);
}
acc /= scale_dst;
}
store_float_value(
dst_md().data_type(), acc, dst_ptr(), dst_idx);
}
}
}

private:
const xpu::sycl::md_t &src_md() const { return conf_.src_md; }
const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; }

void *src_ptr() const { return src_.get_pointer(); }
void *dst_ptr() const { return dst_.get_pointer(); }
float *src_scale_ptr() const {
return static_cast<float *>(src_scale_.get_pointer());
}
float *dst_scale_ptr() const {
return static_cast<float *>(dst_scale_.get_pointer());
}

sycl_reorder_conf_t conf_;

xpu::sycl::in_memory_arg_t src_;
xpu::sycl::out_memory_arg_t dst_;
xpu::sycl::in_memory_arg_t src_scale_;
xpu::sycl::in_memory_arg_t dst_scale_;
data_type_t scales_src_dt_;
data_type_t scales_dst_dt_;
};

} // namespace sycl
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif
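
A host-side sketch (hypothetical, mirroring operator() above) of the two index computations the kernel relies on: decomposing a linear element index into logical offsets for a dense plain layout, and compressing those offsets into a scale-tensor index under a mask whose bit d means "scales vary along dim d":

#include <array>

constexpr int max_ndims = 6;
using sketch_dims_t = std::array<int, max_ndims>;

// off[d] = idx / strides[d] % dims[d], exactly as in the kernel; assumes
// dims/strides are padded with 1 / INT_MAX beyond ndims.
sketch_dims_t decompose(
        int idx, const sketch_dims_t &dims, const sketch_dims_t &strides) {
    sketch_dims_t off {};
    for (int d = 0; d < max_ndims; d++)
        off[d] = idx / strides[d] % dims[d];
    return off;
}

// Masked-off dims collapse to size 1 and offset 0, matching the kernel's
// scale_idx accumulation for src and dst scales.
int scale_index(int mask, int ndims, const sketch_dims_t &off,
        const sketch_dims_t &dims) {
    int scale_idx = 0;
    for (int d = 0; d < ndims; d++) {
        const int dim_d = (mask >> d & 1) ? dims[d] : 1;
        const int off_d = (mask >> d & 1) ? off[d] : 0;
        scale_idx = scale_idx * dim_d + off_d;
    }
    return scale_idx;
}
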