diff --git a/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/01_sycl_dpct_output/Common/exception.h b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/01_sycl_dpct_output/Common/exception.h
new file mode 100644
index 0000000000..ca8ac25258
--- /dev/null
+++ b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/01_sycl_dpct_output/Common/exception.h
@@ -0,0 +1,151 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* CUda UTility Library */
+#ifndef COMMON_EXCEPTION_H_
+#define COMMON_EXCEPTION_H_
+
+// includes, system
+#include <stdlib.h>
+#include <exception>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+//! Exception wrapper.
+//! @param Std_Exception Exception out of namespace std for easy typing.
+template <class Std_Exception>
+class Exception : public Std_Exception {
+ public:
+  //! @brief Static construction interface
+  //! @return Alwayss throws ( Located_Exception<Exception>)
+  //! @param file file in which the Exception occurs
+  //! @param line line in which the Exception occurs
+  //! @param detailed details on the code fragment causing the Exception
+  static void throw_it(const char *file, const int line,
+                       const char *detailed = "-");
+
+  //! Static construction interface
+  //! @return Alwayss throws ( Located_Exception<Exception>)
+  //! @param file file in which the Exception occurs
+  //! @param line line in which the Exception occurs
+  //! @param detailed details on the code fragment causing the Exception
+  static void throw_it(const char *file, const int line,
+                       const std::string &detailed);
+
+  //! Destructor
+  virtual ~Exception() throw();
+
+ private:
+  //! Constructor, default (private)
+  Exception();
+
+  //! Constructor, standard
+  //! @param str string returned by what()
+  explicit Exception(const std::string &str);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//! Exception handler function for arbitrary exceptions
+//! @param ex exception to handle
+////////////////////////////////////////////////////////////////////////////////
+template <class Exception_Typ>
+inline void handleException(const Exception_Typ &ex) {
+  std::cerr << ex.what() << std::endl;
+
+  exit(EXIT_FAILURE);
+}
+
+//! Convenience macros
+
+//! Exception caused by dynamic program behavior, e.g. file does not exist
+#define RUNTIME_EXCEPTION(msg) \
+  Exception<std::runtime_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Logic exception in program, e.g. an assert failed
+#define LOGIC_EXCEPTION(msg) \
+  Exception<std::logic_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Out of range exception
+#define RANGE_EXCEPTION(msg) \
+  Exception<std::range_error>::throw_it(__FILE__, __LINE__, msg)
+
+////////////////////////////////////////////////////////////////////////////////
+//! Implementation
+
+// includes, system
+#include <sstream>
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface.
+//! @param  Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <class Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const char *detailed) {
+  std::stringstream s;
+
+  // Quiet heavy-weight but exceptions are not for
+  // performance / release versions
+  s << "Exception in file '" << file << "' in line " << line << "\n"
+    << "Detailed description: " << detailed << "\n";
+
+  throw Exception(s.str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface.
+//! @param  Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <class Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const std::string &msg) {
+  throw_it(file, line, msg.c_str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, default (private).
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::Exception() : Std_Exception("Unknown Exception.\n") {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, standard (private).
+//! String returned by what().
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::Exception(const std::string &s) : Std_Exception(s) {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Destructor
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::~Exception() throw() {}
+
+  // functions, exported
+
+#endif  // COMMON_EXCEPTION_H_
diff --git a/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/01_sycl_dpct_output/Common/helper_cuda.h b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/01_sycl_dpct_output/Common/helper_cuda.h
new file mode 100644
index 0000000000..396729118e
--- /dev/null
+++ b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/01_sycl_dpct_output/Common/helper_cuda.h
@@ -0,0 +1,1051 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions for initialization and error checking
+
+#ifndef COMMON_HELPER_CUDA_H_
+#define COMMON_HELPER_CUDA_H_
+
+#pragma once
+
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <helper_string.h>
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// Note, it is required that your SDK sample to include the proper header
+// files, please refer the CUDA examples for examples of the needed CUDA
+// headers, which may change depending on which CUDA functions are used.
+
+// CUDA Runtime error messages
+#ifdef __DPCT_HPP__
+static const char *_cudaGetErrorEnum(int error) {
+  /*
+  DPCT1009:0: SYCL uses exceptions to report errors and does not use the error
+  codes. The original code was commented out and a warning string was inserted.
+  You need to rewrite this code.
+  */
+  return "cudaGetErrorName is not supported" /*cudaGetErrorName(error)*/;
+}
+#endif
+
+#ifdef CUDA_DRIVER_API
+// CUDA Driver API errors
+static const char *_cudaGetErrorEnum(CUresult error) {
+  static char unknown[] = "<unknown>";
+  const char *ret = NULL;
+  cuGetErrorName(error, &ret);
+  return ret ? ret : unknown;
+}
+#endif
+
+#ifdef CUBLAS_API_H_
+// cuBLAS API errors
+static const char *_cudaGetErrorEnum(int error) {
+  switch (error) {
+    case 0:
+      return "CUBLAS_STATUS_SUCCESS";
+
+    case 1:
+      return "CUBLAS_STATUS_NOT_INITIALIZED";
+
+    case 3:
+      return "CUBLAS_STATUS_ALLOC_FAILED";
+
+    case 7:
+      return "CUBLAS_STATUS_INVALID_VALUE";
+
+    case 8:
+      return "CUBLAS_STATUS_ARCH_MISMATCH";
+
+    case 11:
+      return "CUBLAS_STATUS_MAPPING_ERROR";
+
+    case 13:
+      return "CUBLAS_STATUS_EXECUTION_FAILED";
+
+    case 14:
+      return "CUBLAS_STATUS_INTERNAL_ERROR";
+
+    case 15:
+      return "CUBLAS_STATUS_NOT_SUPPORTED";
+
+    case 16:
+      return "CUBLAS_STATUS_LICENSE_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef _CUFFT_H_
+// cuFFT API errors
+static const char *_cudaGetErrorEnum(cufftResult error) {
+  switch (error) {
+    case CUFFT_SUCCESS:
+      return "CUFFT_SUCCESS";
+
+    case CUFFT_INVALID_PLAN:
+      return "CUFFT_INVALID_PLAN";
+
+    case CUFFT_ALLOC_FAILED:
+      return "CUFFT_ALLOC_FAILED";
+
+    case CUFFT_INVALID_TYPE:
+      return "CUFFT_INVALID_TYPE";
+
+    case CUFFT_INVALID_VALUE:
+      return "CUFFT_INVALID_VALUE";
+
+    case CUFFT_INTERNAL_ERROR:
+      return "CUFFT_INTERNAL_ERROR";
+
+    case CUFFT_EXEC_FAILED:
+      return "CUFFT_EXEC_FAILED";
+
+    case CUFFT_SETUP_FAILED:
+      return "CUFFT_SETUP_FAILED";
+
+    case CUFFT_INVALID_SIZE:
+      return "CUFFT_INVALID_SIZE";
+
+    case CUFFT_UNALIGNED_DATA:
+      return "CUFFT_UNALIGNED_DATA";
+
+    case CUFFT_INCOMPLETE_PARAMETER_LIST:
+      return "CUFFT_INCOMPLETE_PARAMETER_LIST";
+
+    case CUFFT_INVALID_DEVICE:
+      return "CUFFT_INVALID_DEVICE";
+
+    case CUFFT_PARSE_ERROR:
+      return "CUFFT_PARSE_ERROR";
+
+    case CUFFT_NO_WORKSPACE:
+      return "CUFFT_NO_WORKSPACE";
+
+    case CUFFT_NOT_IMPLEMENTED:
+      return "CUFFT_NOT_IMPLEMENTED";
+
+    case CUFFT_LICENSE_ERROR:
+      return "CUFFT_LICENSE_ERROR";
+
+    case CUFFT_NOT_SUPPORTED:
+      return "CUFFT_NOT_SUPPORTED";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CUSPARSEAPI
+// cuSPARSE API errors
+static const char *_cudaGetErrorEnum(cusparseStatus_t error) {
+  switch (error) {
+    case CUSPARSE_STATUS_SUCCESS:
+      return "CUSPARSE_STATUS_SUCCESS";
+
+    case CUSPARSE_STATUS_NOT_INITIALIZED:
+      return "CUSPARSE_STATUS_NOT_INITIALIZED";
+
+    case CUSPARSE_STATUS_ALLOC_FAILED:
+      return "CUSPARSE_STATUS_ALLOC_FAILED";
+
+    case CUSPARSE_STATUS_INVALID_VALUE:
+      return "CUSPARSE_STATUS_INVALID_VALUE";
+
+    case CUSPARSE_STATUS_ARCH_MISMATCH:
+      return "CUSPARSE_STATUS_ARCH_MISMATCH";
+
+    case CUSPARSE_STATUS_MAPPING_ERROR:
+      return "CUSPARSE_STATUS_MAPPING_ERROR";
+
+    case CUSPARSE_STATUS_EXECUTION_FAILED:
+      return "CUSPARSE_STATUS_EXECUTION_FAILED";
+
+    case CUSPARSE_STATUS_INTERNAL_ERROR:
+      return "CUSPARSE_STATUS_INTERNAL_ERROR";
+
+    case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CUSOLVER_COMMON_H_
+// cuSOLVER API errors
+static const char *_cudaGetErrorEnum(cusolverStatus_t error) {
+  switch (error) {
+    case CUSOLVER_STATUS_SUCCESS:
+      return "CUSOLVER_STATUS_SUCCESS";
+    case CUSOLVER_STATUS_NOT_INITIALIZED:
+      return "CUSOLVER_STATUS_NOT_INITIALIZED";
+    case CUSOLVER_STATUS_ALLOC_FAILED:
+      return "CUSOLVER_STATUS_ALLOC_FAILED";
+    case CUSOLVER_STATUS_INVALID_VALUE:
+      return "CUSOLVER_STATUS_INVALID_VALUE";
+    case CUSOLVER_STATUS_ARCH_MISMATCH:
+      return "CUSOLVER_STATUS_ARCH_MISMATCH";
+    case CUSOLVER_STATUS_MAPPING_ERROR:
+      return "CUSOLVER_STATUS_MAPPING_ERROR";
+    case CUSOLVER_STATUS_EXECUTION_FAILED:
+      return "CUSOLVER_STATUS_EXECUTION_FAILED";
+    case CUSOLVER_STATUS_INTERNAL_ERROR:
+      return "CUSOLVER_STATUS_INTERNAL_ERROR";
+    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+    case CUSOLVER_STATUS_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_NOT_SUPPORTED ";
+    case CUSOLVER_STATUS_ZERO_PIVOT:
+      return "CUSOLVER_STATUS_ZERO_PIVOT";
+    case CUSOLVER_STATUS_INVALID_LICENSE:
+      return "CUSOLVER_STATUS_INVALID_LICENSE";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CURAND_H_
+// cuRAND API errors
+static const char *_cudaGetErrorEnum(int error) {
+  switch (error) {
+    case 0:
+      return "CURAND_STATUS_SUCCESS";
+
+    case 100:
+      return "CURAND_STATUS_VERSION_MISMATCH";
+
+    case 101:
+      return "CURAND_STATUS_NOT_INITIALIZED";
+
+    case 102:
+      return "CURAND_STATUS_ALLOCATION_FAILED";
+
+    case 103:
+      return "CURAND_STATUS_TYPE_ERROR";
+
+    case 104:
+      return "CURAND_STATUS_OUT_OF_RANGE";
+
+    case 105:
+      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+
+    case 106:
+      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+
+    case 201:
+      return "CURAND_STATUS_LAUNCH_FAILURE";
+
+    case 202:
+      return "CURAND_STATUS_PREEXISTING_FAILURE";
+
+    case 203:
+      return "CURAND_STATUS_INITIALIZATION_FAILED";
+
+    case 204:
+      return "CURAND_STATUS_ARCH_MISMATCH";
+
+    case 999:
+      return "CURAND_STATUS_INTERNAL_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef NVJPEGAPI
+// nvJPEG API errors
+static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
+  switch (error) {
+    case NVJPEG_STATUS_SUCCESS:
+      return "NVJPEG_STATUS_SUCCESS";
+
+    case NVJPEG_STATUS_NOT_INITIALIZED:
+      return "NVJPEG_STATUS_NOT_INITIALIZED";
+
+    case NVJPEG_STATUS_INVALID_PARAMETER:
+      return "NVJPEG_STATUS_INVALID_PARAMETER";
+
+    case NVJPEG_STATUS_BAD_JPEG:
+      return "NVJPEG_STATUS_BAD_JPEG";
+
+    case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
+      return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
+
+    case NVJPEG_STATUS_ALLOCATOR_FAILURE:
+      return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
+
+    case NVJPEG_STATUS_EXECUTION_FAILED:
+      return "NVJPEG_STATUS_EXECUTION_FAILED";
+
+    case NVJPEG_STATUS_ARCH_MISMATCH:
+      return "NVJPEG_STATUS_ARCH_MISMATCH";
+
+    case NVJPEG_STATUS_INTERNAL_ERROR:
+      return "NVJPEG_STATUS_INTERNAL_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef NV_NPPIDEFS_H
+// NPP API errors
+static const char *_cudaGetErrorEnum(NppStatus error) {
+  switch (error) {
+    case NPP_NOT_SUPPORTED_MODE_ERROR:
+      return "NPP_NOT_SUPPORTED_MODE_ERROR";
+
+    case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
+      return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
+
+    case NPP_RESIZE_NO_OPERATION_ERROR:
+      return "NPP_RESIZE_NO_OPERATION_ERROR";
+
+    case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
+      return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+    case NPP_BAD_ARG_ERROR:
+      return "NPP_BAD_ARGUMENT_ERROR";
+
+    case NPP_COEFF_ERROR:
+      return "NPP_COEFFICIENT_ERROR";
+
+    case NPP_RECT_ERROR:
+      return "NPP_RECTANGLE_ERROR";
+
+    case NPP_QUAD_ERROR:
+      return "NPP_QUADRANGLE_ERROR";
+
+    case NPP_MEM_ALLOC_ERR:
+      return "NPP_MEMORY_ALLOCATION_ERROR";
+
+    case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_INVALID_INPUT:
+      return "NPP_INVALID_INPUT";
+
+    case NPP_POINTER_ERROR:
+      return "NPP_POINTER_ERROR";
+
+    case NPP_WARNING:
+      return "NPP_WARNING";
+
+    case NPP_ODD_ROI_WARNING:
+      return "NPP_ODD_ROI_WARNING";
+#else
+
+    // These are for CUDA 5.5 or higher
+    case NPP_BAD_ARGUMENT_ERROR:
+      return "NPP_BAD_ARGUMENT_ERROR";
+
+    case NPP_COEFFICIENT_ERROR:
+      return "NPP_COEFFICIENT_ERROR";
+
+    case NPP_RECTANGLE_ERROR:
+      return "NPP_RECTANGLE_ERROR";
+
+    case NPP_QUADRANGLE_ERROR:
+      return "NPP_QUADRANGLE_ERROR";
+
+    case NPP_MEMORY_ALLOCATION_ERR:
+      return "NPP_MEMORY_ALLOCATION_ERROR";
+
+    case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_INVALID_HOST_POINTER_ERROR:
+      return "NPP_INVALID_HOST_POINTER_ERROR";
+
+    case NPP_INVALID_DEVICE_POINTER_ERROR:
+      return "NPP_INVALID_DEVICE_POINTER_ERROR";
+#endif
+
+    case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_TEXTURE_BIND_ERROR:
+      return "NPP_TEXTURE_BIND_ERROR";
+
+    case NPP_WRONG_INTERSECTION_ROI_ERROR:
+      return "NPP_WRONG_INTERSECTION_ROI_ERROR";
+
+    case NPP_NOT_EVEN_STEP_ERROR:
+      return "NPP_NOT_EVEN_STEP_ERROR";
+
+    case NPP_INTERPOLATION_ERROR:
+      return "NPP_INTERPOLATION_ERROR";
+
+    case NPP_RESIZE_FACTOR_ERROR:
+      return "NPP_RESIZE_FACTOR_ERROR";
+
+    case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
+      return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+    case NPP_MEMFREE_ERR:
+      return "NPP_MEMFREE_ERR";
+
+    case NPP_MEMSET_ERR:
+      return "NPP_MEMSET_ERR";
+
+    case NPP_MEMCPY_ERR:
+      return "NPP_MEMCPY_ERROR";
+
+    case NPP_MIRROR_FLIP_ERR:
+      return "NPP_MIRROR_FLIP_ERR";
+#else
+
+    case NPP_MEMFREE_ERROR:
+      return "NPP_MEMFREE_ERROR";
+
+    case NPP_MEMSET_ERROR:
+      return "NPP_MEMSET_ERROR";
+
+    case NPP_MEMCPY_ERROR:
+      return "NPP_MEMCPY_ERROR";
+
+    case NPP_MIRROR_FLIP_ERROR:
+      return "NPP_MIRROR_FLIP_ERROR";
+#endif
+
+    case NPP_ALIGNMENT_ERROR:
+      return "NPP_ALIGNMENT_ERROR";
+
+    case NPP_STEP_ERROR:
+      return "NPP_STEP_ERROR";
+
+    case NPP_SIZE_ERROR:
+      return "NPP_SIZE_ERROR";
+
+    case NPP_NULL_POINTER_ERROR:
+      return "NPP_NULL_POINTER_ERROR";
+
+    case NPP_CUDA_KERNEL_EXECUTION_ERROR:
+      return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
+
+    case NPP_NOT_IMPLEMENTED_ERROR:
+      return "NPP_NOT_IMPLEMENTED_ERROR";
+
+    case NPP_ERROR:
+      return "NPP_ERROR";
+
+    case NPP_SUCCESS:
+      return "NPP_SUCCESS";
+
+    case NPP_WRONG_INTERSECTION_QUAD_WARNING:
+      return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
+
+    case NPP_MISALIGNED_DST_ROI_WARNING:
+      return "NPP_MISALIGNED_DST_ROI_WARNING";
+
+    case NPP_AFFINE_QUAD_INCORRECT_WARNING:
+      return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
+
+    case NPP_DOUBLE_SIZE_WARNING:
+      return "NPP_DOUBLE_SIZE_WARNING";
+
+    case NPP_WRONG_INTERSECTION_ROI_WARNING:
+      return "NPP_WRONG_INTERSECTION_ROI_WARNING";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
+    /* These are 6.0 or higher */
+    case NPP_LUT_PALETTE_BITSIZE_ERROR:
+      return "NPP_LUT_PALETTE_BITSIZE_ERROR";
+
+    case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
+      return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
+
+    case NPP_QUALITY_INDEX_ERROR:
+      return "NPP_QUALITY_INDEX_ERROR";
+
+    case NPP_CHANNEL_ORDER_ERROR:
+      return "NPP_CHANNEL_ORDER_ERROR";
+
+    case NPP_ZERO_MASK_VALUE_ERROR:
+      return "NPP_ZERO_MASK_VALUE_ERROR";
+
+    case NPP_NUMBER_OF_CHANNELS_ERROR:
+      return "NPP_NUMBER_OF_CHANNELS_ERROR";
+
+    case NPP_COI_ERROR:
+      return "NPP_COI_ERROR";
+
+    case NPP_DIVISOR_ERROR:
+      return "NPP_DIVISOR_ERROR";
+
+    case NPP_CHANNEL_ERROR:
+      return "NPP_CHANNEL_ERROR";
+
+    case NPP_STRIDE_ERROR:
+      return "NPP_STRIDE_ERROR";
+
+    case NPP_ANCHOR_ERROR:
+      return "NPP_ANCHOR_ERROR";
+
+    case NPP_MASK_SIZE_ERROR:
+      return "NPP_MASK_SIZE_ERROR";
+
+    case NPP_MOMENT_00_ZERO_ERROR:
+      return "NPP_MOMENT_00_ZERO_ERROR";
+
+    case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
+      return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
+
+    case NPP_THRESHOLD_ERROR:
+      return "NPP_THRESHOLD_ERROR";
+
+    case NPP_CONTEXT_MATCH_ERROR:
+      return "NPP_CONTEXT_MATCH_ERROR";
+
+    case NPP_FFT_FLAG_ERROR:
+      return "NPP_FFT_FLAG_ERROR";
+
+    case NPP_FFT_ORDER_ERROR:
+      return "NPP_FFT_ORDER_ERROR";
+
+    case NPP_SCALE_RANGE_ERROR:
+      return "NPP_SCALE_RANGE_ERROR";
+
+    case NPP_DATA_TYPE_ERROR:
+      return "NPP_DATA_TYPE_ERROR";
+
+    case NPP_OUT_OFF_RANGE_ERROR:
+      return "NPP_OUT_OFF_RANGE_ERROR";
+
+    case NPP_DIVIDE_BY_ZERO_ERROR:
+      return "NPP_DIVIDE_BY_ZERO_ERROR";
+
+    case NPP_RANGE_ERROR:
+      return "NPP_RANGE_ERROR";
+
+    case NPP_NO_MEMORY_ERROR:
+      return "NPP_NO_MEMORY_ERROR";
+
+    case NPP_ERROR_RESERVED:
+      return "NPP_ERROR_RESERVED";
+
+    case NPP_NO_OPERATION_WARNING:
+      return "NPP_NO_OPERATION_WARNING";
+
+    case NPP_DIVIDE_BY_ZERO_WARNING:
+      return "NPP_DIVIDE_BY_ZERO_WARNING";
+#endif
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000
+    /* These are 7.0 or higher */
+    case NPP_OVERFLOW_ERROR:
+      return "NPP_OVERFLOW_ERROR";
+
+    case NPP_CORRUPTED_DATA_ERROR:
+      return "NPP_CORRUPTED_DATA_ERROR";
+#endif
+  }
+
+  return "<unknown>";
+}
+#endif
+
+template <typename T>
+void check(T result, char const *const func, const char *const file,
+           int const line) {
+}
+
+#ifdef __DPCT_HPP__
+// This will output the proper CUDA error strings in the event
+// that a CUDA host call returns an error
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+
+// This will output the proper error string when calling cudaGetLastError
+#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __getLastCudaError(const char *errorMessage, const char *file,
+                               const int line) {
+  /*
+  DPCT1010:1: SYCL uses exceptions to report errors and does not use the error
+  codes. The call was replaced with 0. You need to rewrite this code.
+  */
+  int err = 0;
+}
+
+// This will only print the proper error string when calling cudaGetLastError
+// but not exit program incase error detected.
+#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __printLastCudaError(const char *errorMessage, const char *file,
+                                 const int line) {
+  /*
+  DPCT1010:3: SYCL uses exceptions to report errors and does not use the error
+  codes. The call was replaced with 0. You need to rewrite this code.
+  */
+  int err = 0;
+}
+#endif
+
+#ifndef MAX
+#define MAX(a, b) (a > b ? a : b)
+#endif
+
+// Float To Int conversion
+inline int ftoi(float value) {
+  return (value >= 0 ? static_cast<int>(value + 0.5)
+                     : static_cast<int>(value - 0.5));
+}
+
+// Beginning of GPU Architecture definitions
+inline int _ConvertSMVer2Cores(int major, int minor) {
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the # of cores per SM
+  typedef struct dpct_type_168589 {
+    int SM;  // 0xMm (hexidecimal notation), M = SM Major version,
+    // and m = SM minor version
+    int Cores;
+  } sSMtoCores;
+
+  sSMtoCores nGpuArchCoresPerSM[] = {
+      {0x30, 192},
+      {0x32, 192},
+      {0x35, 192},
+      {0x37, 192},
+      {0x50, 128},
+      {0x52, 128},
+      {0x53, 128},
+      {0x60,  64},
+      {0x61, 128},
+      {0x62, 128},
+      {0x70,  64},
+      {0x72,  64},
+      {0x75,  64},
+      {0x80,  64},
+      {0x86, 128},
+      {0x87, 128},
+      {0x90, 128},
+      {-1, -1}};
+
+  int index = 0;
+
+  while (nGpuArchCoresPerSM[index].SM != -1) {
+    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
+      return nGpuArchCoresPerSM[index].Cores;
+    }
+
+    index++;
+  }
+
+  // If we don't find the values, we default use the previous one
+  // to run properly
+  printf(
+      "MapSMtoCores for SM %d.%d is undefined."
+      "  Default to use %d Cores/SM\n",
+      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
+  return nGpuArchCoresPerSM[index - 1].Cores;
+}
+
+inline const char* _ConvertSMVer2ArchName(int major, int minor) {
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the GPU Arch name)
+  typedef struct dpct_type_127073 {
+    int SM;  // 0xMm (hexidecimal notation), M = SM Major version,
+    // and m = SM minor version
+    const char* name;
+  } sSMtoArchName;
+
+  sSMtoArchName nGpuArchNameSM[] = {
+      {0x30, "Kepler"},
+      {0x32, "Kepler"},
+      {0x35, "Kepler"},
+      {0x37, "Kepler"},
+      {0x50, "Maxwell"},
+      {0x52, "Maxwell"},
+      {0x53, "Maxwell"},
+      {0x60, "Pascal"},
+      {0x61, "Pascal"},
+      {0x62, "Pascal"},
+      {0x70, "Volta"},
+      {0x72, "Xavier"},
+      {0x75, "Turing"},
+      {0x80, "Ampere"},
+      {0x86, "Ampere"},
+      {0x87, "Ampere"},
+      {0x90, "Hopper"},
+      {-1, "Graphics Device"}};
+
+  int index = 0;
+
+  while (nGpuArchNameSM[index].SM != -1) {
+    if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) {
+      return nGpuArchNameSM[index].name;
+    }
+
+    index++;
+  }
+
+  // If we don't find the values, we default use the previous one
+  // to run properly
+  printf(
+      "MapSMtoArchName for SM %d.%d is undefined."
+      "  Default to use %s\n",
+      major, minor, nGpuArchNameSM[index - 1].name);
+  return nGpuArchNameSM[index - 1].name;
+}
+  // end of GPU Architecture definitions
+
+#ifdef __DPCT_HPP__
+// General GPU Device CUDA Initialization
+inline int gpuDeviceInit(int devID) {
+  int device_count;
+  /*
+  DPCT1003:5: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors((device_count = dpct::dev_mgr::instance().device_count(), 0));
+
+  if (device_count == 0) {
+    fprintf(stderr,
+            "gpuDeviceInit() CUDA error: "
+            "no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (devID < 0) {
+    devID = 0;
+  }
+
+  if (devID > device_count - 1) {
+    fprintf(stderr, "\n");
+    fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
+            device_count);
+    fprintf(stderr,
+            ">> gpuDeviceInit (-device=%d) is not a valid"
+            " GPU device. <<\n",
+            devID);
+    fprintf(stderr, "\n");
+    return -devID;
+  }
+
+  int computeMode = -1, major = 0, minor = 0;
+  /*
+  DPCT1035:6: All SYCL devices can be used by host to submit tasks. You may need
+  to adjust this code.
+  */
+  checkCudaErrors((computeMode = 1, 0));
+  checkCudaErrors(
+      (major = dpct::dev_mgr::instance().get_device(devID).get_major_version(),
+       0));
+  checkCudaErrors(
+      (minor = dpct::dev_mgr::instance().get_device(devID).get_minor_version(),
+       0));
+  /*
+  DPCT1035:7: All SYCL devices can be used by host to submit tasks. You may need
+  to adjust this code.
+  */
+  if (computeMode == 0) {
+    fprintf(stderr,
+            "Error: device is running in <Compute Mode "
+            "Prohibited>, no threads can use cudaSetDevice().\n");
+    return -1;
+  }
+
+  if (major < 1) {
+    fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  /*
+  DPCT1093:8: The "devID" may not be the best XPU device. Adjust the selected
+  device if needed.
+  */
+  /*
+  DPCT1003:9: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors((dpct::select_device(devID), 0));
+  printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, _ConvertSMVer2ArchName(major, minor));
+
+  return devID;
+}
+
+// This function returns the best GPU (with maximum GFLOPS)
+inline int gpuGetMaxGflopsDeviceId() try {
+  int current_device = 0, sm_per_multiproc = 0;
+  int max_perf_device = 0;
+  int device_count = 0;
+  int devices_prohibited = 0;
+
+  uint64_t max_compute_perf = 0;
+  /*
+  DPCT1003:10: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors((device_count = dpct::dev_mgr::instance().device_count(), 0));
+
+  if (device_count == 0) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceId() CUDA error:"
+            " no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Find the best CUDA capable GPU device
+  current_device = 0;
+
+  while (current_device < device_count) {
+    int computeMode = -1, major = 0, minor = 0;
+    /*
+    DPCT1035:11: All SYCL devices can be used by host to submit tasks. You may
+    need to adjust this code.
+    */
+    checkCudaErrors((computeMode = 1, 0));
+    checkCudaErrors((major = dpct::dev_mgr::instance()
+                                 .get_device(current_device)
+                                 .get_major_version(),
+                     0));
+    checkCudaErrors((minor = dpct::dev_mgr::instance()
+                                 .get_device(current_device)
+                                 .get_minor_version(),
+                     0));
+
+    // If this GPU is not running on Compute Mode prohibited,
+    // then we can add it to the list
+    /*
+    DPCT1035:12: All SYCL devices can be used by host to submit tasks. You may
+    need to adjust this code.
+    */
+    if (computeMode != 0) {
+      if (major == 9999 && minor == 9999) {
+        sm_per_multiproc = 1;
+      } else {
+        sm_per_multiproc =
+            _ConvertSMVer2Cores(major,  minor);
+      }
+      int multiProcessorCount = 0, clockRate = 0;
+      checkCudaErrors((multiProcessorCount = dpct::dev_mgr::instance()
+                                                 .get_device(current_device)
+                                                 .get_max_compute_units(),
+                       0));
+      int result = (clockRate = dpct::dev_mgr::instance()
+                                    .get_device(current_device)
+                                    .get_max_clock_frequency(),
+                    0);
+
+      uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate;
+
+      if (compute_perf > max_compute_perf) {
+        max_compute_perf = compute_perf;
+        max_perf_device = current_device;
+      }
+    } else {
+      devices_prohibited++;
+    }
+
+    ++current_device;
+  }
+
+  if (devices_prohibited == device_count) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceId() CUDA error:"
+            " all devices have compute mode prohibited.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return max_perf_device;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+// Initialization code to find the best CUDA Device
+inline int findCudaDevice(int argc, const char **argv) {
+  int devID = 0;
+
+  // If the command-line has a device number specified, use it
+  if (checkCmdLineFlag(argc, argv, "device")) {
+    devID = getCmdLineArgumentInt(argc, argv, "device=");
+
+    if (devID < 0) {
+      printf("Invalid command line parameter\n ");
+      exit(EXIT_FAILURE);
+    } else {
+      devID = gpuDeviceInit(devID);
+
+      if (devID < 0) {
+        printf("exiting...\n");
+        exit(EXIT_FAILURE);
+      }
+    }
+  } else {
+    // Otherwise pick the device with highest Gflops/s
+    devID = gpuGetMaxGflopsDeviceId();
+    /*
+    DPCT1093:13: The "devID" may not be the best XPU device. Adjust the selected
+    device if needed.
+    */
+    /*
+    DPCT1003:14: Migrated API does not return error code. (*, 0) is inserted.
+    You may need to rewrite this code.
+    */
+    checkCudaErrors((dpct::select_device(devID), 0));
+    int major = 0, minor = 0;
+    checkCudaErrors((
+        major = dpct::dev_mgr::instance().get_device(devID).get_major_version(),
+        0));
+    checkCudaErrors((
+        minor = dpct::dev_mgr::instance().get_device(devID).get_minor_version(),
+        0));
+    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+           devID, _ConvertSMVer2ArchName(major, minor), major, minor);
+
+  }
+
+  return devID;
+}
+
+inline int findIntegratedGPU() {
+  int current_device = 0;
+  int device_count = 0;
+  int devices_prohibited = 0;
+
+  /*
+  DPCT1003:15: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors((device_count = dpct::dev_mgr::instance().device_count(), 0));
+
+  if (device_count == 0) {
+    fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Find the integrated GPU which is compute capable
+  while (current_device < device_count) {
+    int computeMode = -1, integrated = -1;
+    /*
+    DPCT1035:16: All SYCL devices can be used by host to submit tasks. You may
+    need to adjust this code.
+    */
+    checkCudaErrors((computeMode = 1, 0));
+    checkCudaErrors((integrated = dpct::dev_mgr::instance()
+                                      .get_device(current_device)
+                                      .get_integrated(),
+                     0));
+    // If GPU is integrated and is not running on Compute Mode prohibited,
+    // then cuda can map to GLES resource
+    /*
+    DPCT1035:17: All SYCL devices can be used by host to submit tasks. You may
+    need to adjust this code.
+    */
+    if (integrated && (computeMode != 0)) {
+      /*
+      DPCT1093:18: The "current_device" may not be the best XPU device. Adjust
+      the selected device if needed.
+      */
+      /*
+      DPCT1003:19: Migrated API does not return error code. (*, 0) is inserted.
+      You may need to rewrite this code.
+      */
+      checkCudaErrors((dpct::select_device(current_device), 0));
+
+      int major = 0, minor = 0;
+      checkCudaErrors((major = dpct::dev_mgr::instance()
+                                   .get_device(current_device)
+                                   .get_major_version(),
+                       0));
+      checkCudaErrors((minor = dpct::dev_mgr::instance()
+                                   .get_device(current_device)
+                                   .get_minor_version(),
+                       0));
+      printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+             current_device, _ConvertSMVer2ArchName(major, minor), major, minor);
+
+      return current_device;
+    } else {
+      devices_prohibited++;
+    }
+
+    current_device++;
+  }
+
+  if (devices_prohibited == device_count) {
+    fprintf(stderr,
+            "CUDA error:"
+            " No GLES-CUDA Interop capable GPU found.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return -1;
+}
+
+// General check for CUDA GPU SM Capabilities
+inline bool checkCudaCapabilities(int major_version, int minor_version) {
+  int dev;
+  int major = 0, minor = 0;
+
+  checkCudaErrors(dev = dpct::dev_mgr::instance().current_device_id());
+  checkCudaErrors(
+      (major = dpct::dev_mgr::instance().get_device(dev).get_major_version(),
+       0));
+  checkCudaErrors(
+      (minor = dpct::dev_mgr::instance().get_device(dev).get_minor_version(),
+       0));
+
+  if ((major > major_version) ||
+      (major == major_version &&
+       minor >= minor_version)) {
+    printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev,
+           _ConvertSMVer2ArchName(major, minor), major, minor);
+    return true;
+  } else {
+    printf(
+        "  No GPU device was found that can support "
+        "CUDA compute capability %d.%d.\n",
+        major_version, minor_version);
+    return false;
+  }
+}
+#endif
+
+  // end of CUDA Helper Functions
+
+#endif  // COMMON_HELPER_CUDA_H_
diff --git a/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/01_sycl_dpct_output/Common/helper_functions.h b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/01_sycl_dpct_output/Common/helper_functions.h
new file mode 100644
index 0000000000..2975ddba6a
--- /dev/null
+++ b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/01_sycl_dpct_output/Common/helper_functions.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (string parsing,
+// timers, image helpers, etc)
+#ifndef COMMON_HELPER_FUNCTIONS_H_
+#define COMMON_HELPER_FUNCTIONS_H_
+
+#ifdef WIN32
+#pragma warning(disable : 4996)
+#endif
+
+// includes, project
+#include <assert.h>
+#include <exception.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// includes, timer, string parsing, image helpers
+#include <helper_image.h>  // helper functions for image compare, dump, data comparisons
+#include <helper_string.h>  // helper functions for string parsing
+#include <helper_timer.h>   // helper functions for timers
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#endif  // COMMON_HELPER_FUNCTIONS_H_
diff --git a/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/01_sycl_dpct_output/Common/helper_image.h b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/01_sycl_dpct_output/Common/helper_image.h
new file mode 100644
index 0000000000..d093d16a0e
--- /dev/null
+++ b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/01_sycl_dpct_output/Common/helper_image.h
@@ -0,0 +1,1001 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (image,bitmap)
+#ifndef COMMON_HELPER_IMAGE_H_
+#define COMMON_HELPER_IMAGE_H_
+
+#include <assert.h>
+#include <exception.h>
+#include <math.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#ifndef MIN
+#define MIN(a, b) ((a < b) ? a : b)
+#endif
+#ifndef MAX
+#define MAX(a, b) ((a > b) ? a : b)
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#include "helper_string.h"
+
+// namespace unnamed (internal)
+namespace helper_image_internal {
+//! size of PGM file header
+const unsigned int PGMHeaderSize = 0x40;
+
+// types
+
+//! Data converter from unsigned char / unsigned byte to type T
+template <class T>
+struct ConverterFromUByte;
+
+//! Data converter from unsigned char / unsigned byte
+template <>
+struct ConverterFromUByte<unsigned char> {
+  //! Conversion operator
+  //! @return converted value
+  //! @param  val  value to convert
+  float operator()(const unsigned char &val) {
+    return static_cast<unsigned char>(val);
+  }
+};
+
+//! Data converter from unsigned char / unsigned byte to float
+template <>
+struct ConverterFromUByte<float> {
+  //! Conversion operator
+  //! @return converted value
+  //! @param  val  value to convert
+  float operator()(const unsigned char &val) {
+    return static_cast<float>(val) / 255.0f;
+  }
+};
+
+//! Data converter from unsigned char / unsigned byte to type T
+template <class T>
+struct ConverterToUByte;
+
+//! Data converter from unsigned char / unsigned byte to unsigned int
+template <>
+struct ConverterToUByte<unsigned char> {
+  //! Conversion operator (essentially a passthru
+  //! @return converted value
+  //! @param  val  value to convert
+  unsigned char operator()(const unsigned char &val) { return val; }
+};
+
+//! Data converter from unsigned char / unsigned byte to unsigned int
+template <>
+struct ConverterToUByte<float> {
+  //! Conversion operator
+  //! @return converted value
+  //! @param  val  value to convert
+  unsigned char operator()(const float &val) {
+    return static_cast<unsigned char>(val * 255.0f);
+  }
+};
+}  // namespace helper_image_internal
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#else
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#endif
+
+inline bool __loadPPM(const char *file, unsigned char **data, unsigned int *w,
+                      unsigned int *h, unsigned int *channels) {
+  FILE *fp = NULL;
+
+  if (FOPEN_FAIL(FOPEN(fp, file, "rb"))) {
+    std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl;
+    return false;
+  }
+
+  // check header
+  char header[helper_image_internal::PGMHeaderSize];
+
+  if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) {
+    std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl;
+    return false;
+  }
+
+  if (strncmp(header, "P5", 2) == 0) {
+    *channels = 1;
+  } else if (strncmp(header, "P6", 2) == 0) {
+    *channels = 3;
+  } else {
+    std::cerr << "__LoadPPM() : File is not a PPM or PGM image" << std::endl;
+    *channels = 0;
+    return false;
+  }
+
+  // parse header, read maxval, width and height
+  unsigned int width = 0;
+  unsigned int height = 0;
+  unsigned int maxval = 0;
+  unsigned int i = 0;
+
+  while (i < 3) {
+    if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) {
+      std::cerr << "__LoadPPM() : reading PGM header returned NULL"
+                << std::endl;
+      return false;
+    }
+
+    if (header[0] == '#') {
+      continue;
+    }
+
+    if (i == 0) {
+      i += SSCANF(header, "%u %u %u", &width, &height, &maxval);
+    } else if (i == 1) {
+      i += SSCANF(header, "%u %u", &height, &maxval);
+    } else if (i == 2) {
+      i += SSCANF(header, "%u", &maxval);
+    }
+  }
+
+  // check if given handle for the data is initialized
+  if (NULL != *data) {
+    if (*w != width || *h != height) {
+      std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl;
+    }
+  } else {
+    *data = (unsigned char *)malloc(sizeof(unsigned char) * width * height *
+                                    *channels);
+    *w = width;
+    *h = height;
+  }
+
+  // read and close file
+  if (fread(*data, sizeof(unsigned char), width * height * *channels, fp) ==
+      0) {
+    std::cerr << "__LoadPPM() read data returned error." << std::endl;
+  }
+
+  fclose(fp);
+
+  return true;
+}
+
+template <class T>
+inline bool sdkLoadPGM(const char *file, T **data, unsigned int *w,
+                       unsigned int *h) {
+  unsigned char *idata = NULL;
+  unsigned int channels;
+
+  if (true != __loadPPM(file, &idata, w, h, &channels)) {
+    return false;
+  }
+
+  unsigned int size = *w * *h * channels;
+
+  // initialize mem if necessary
+  // the correct size is checked / set in loadPGMc()
+  if (NULL == *data) {
+    *data = reinterpret_cast<T *>(malloc(sizeof(T) * size));
+  }
+
+  // copy and cast data
+  std::transform(idata, idata + size, *data,
+                 helper_image_internal::ConverterFromUByte<T>());
+
+  free(idata);
+
+  return true;
+}
+
+template <class T>
+inline bool sdkLoadPPM4(const char *file, T **data, unsigned int *w,
+                        unsigned int *h) {
+  unsigned char *idata = 0;
+  unsigned int channels;
+
+  if (__loadPPM(file, &idata, w, h, &channels)) {
+    // pad 4th component
+    int size = *w * *h;
+    // keep the original pointer
+    unsigned char *idata_orig = idata;
+    *data = reinterpret_cast<T *>(malloc(sizeof(T) * size * 4));
+    unsigned char *ptr = *data;
+
+    for (int i = 0; i < size; i++) {
+      *ptr++ = *idata++;
+      *ptr++ = *idata++;
+      *ptr++ = *idata++;
+      *ptr++ = 0;
+    }
+
+    free(idata_orig);
+    return true;
+  } else {
+    free(idata);
+    return false;
+  }
+}
+
+inline bool __savePPM(const char *file, unsigned char *data, unsigned int w,
+                      unsigned int h, unsigned int channels) {
+  assert(NULL != data);
+  assert(w > 0);
+  assert(h > 0);
+
+  std::fstream fh(file, std::fstream::out | std::fstream::binary);
+
+  if (fh.bad()) {
+    std::cerr << "__savePPM() : Opening file failed." << std::endl;
+    return false;
+  }
+
+  if (channels == 1) {
+    fh << "P5\n";
+  } else if (channels == 3) {
+    fh << "P6\n";
+  } else {
+    std::cerr << "__savePPM() : Invalid number of channels." << std::endl;
+    return false;
+  }
+
+  fh << w << "\n" << h << "\n" << 0xff << std::endl;
+
+  for (unsigned int i = 0; (i < (w * h * channels)) && fh.good(); ++i) {
+    fh << data[i];
+  }
+
+  fh.flush();
+
+  if (fh.bad()) {
+    std::cerr << "__savePPM() : Writing data failed." << std::endl;
+    return false;
+  }
+
+  fh.close();
+
+  return true;
+}
+
+template <class T>
+inline bool sdkSavePGM(const char *file, T *data, unsigned int w,
+                       unsigned int h) {
+  unsigned int size = w * h;
+  unsigned char *idata = (unsigned char *)malloc(sizeof(unsigned char) * size);
+
+  std::transform(data, data + size, idata,
+                 helper_image_internal::ConverterToUByte<T>());
+
+  // write file
+  bool result = __savePPM(file, idata, w, h, 1);
+
+  // cleanup
+  free(idata);
+
+  return result;
+}
+
+inline bool sdkSavePPM4ub(const char *file, unsigned char *data, unsigned int w,
+                          unsigned int h) {
+  // strip 4th component
+  int size = w * h;
+  unsigned char *ndata =
+      (unsigned char *)malloc(sizeof(unsigned char) * size * 3);
+  unsigned char *ptr = ndata;
+
+  for (int i = 0; i < size; i++) {
+    *ptr++ = *data++;
+    *ptr++ = *data++;
+    *ptr++ = *data++;
+    data++;
+  }
+
+  bool result = __savePPM(file, ndata, w, h, 3);
+  free(ndata);
+  return result;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Read file \filename and return the data
+//! @return bool if reading the file succeeded, otherwise false
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//////////////////////////////////////////////////////////////////////////////
+template <class T>
+inline bool sdkReadFile(const char *filename, T **data, unsigned int *len,
+                        bool verbose) {
+  // check input arguments
+  assert(NULL != filename);
+  assert(NULL != len);
+
+  // intermediate storage for the data read
+  std::vector<T> data_read;
+
+  // open file for reading
+  FILE *fh = NULL;
+
+  // check if filestream is valid
+  if (FOPEN_FAIL(FOPEN(fh, filename, "r"))) {
+    printf("Unable to open input file: %s\n", filename);
+    return false;
+  }
+
+  // read all data elements
+  T token;
+
+  while (!feof(fh)) {
+    fscanf(fh, "%f", &token);
+    data_read.push_back(token);
+  }
+
+  // the last element is read twice
+  data_read.pop_back();
+  fclose(fh);
+
+  // check if the given handle is already initialized
+  if (NULL != *data) {
+    if (*len != data_read.size()) {
+      std::cerr << "sdkReadFile() : Initialized memory given but "
+                << "size  mismatch with signal read "
+                << "(data read / data init = " << (unsigned int)data_read.size()
+                << " / " << *len << ")" << std::endl;
+
+      return false;
+    }
+  } else {
+    // allocate storage for the data read
+    *data = reinterpret_cast<T *>(malloc(sizeof(T) * data_read.size()));
+    // store signal size
+    *len = static_cast<unsigned int>(data_read.size());
+  }
+
+  // copy data
+  memcpy(*data, &data_read.front(), sizeof(T) * data_read.size());
+
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Read file \filename and return the data
+//! @return bool if reading the file succeeded, otherwise false
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//////////////////////////////////////////////////////////////////////////////
+template <class T>
+inline bool sdkReadFileBlocks(const char *filename, T **data, unsigned int *len,
+                              unsigned int block_num, unsigned int block_size,
+                              bool verbose) {
+  // check input arguments
+  assert(NULL != filename);
+  assert(NULL != len);
+
+  // open file for reading
+  FILE *fh = fopen(filename, "rb");
+
+  if (fh == NULL && verbose) {
+    std::cerr << "sdkReadFile() : Opening file failed." << std::endl;
+    return false;
+  }
+
+  // check if the given handle is already initialized
+  // allocate storage for the data read
+  data[block_num] = reinterpret_cast<T *>(malloc(block_size));
+
+  // read all data elements
+  fseek(fh, block_num * block_size, SEEK_SET);
+  *len = fread(data[block_num], sizeof(T), block_size / sizeof(T), fh);
+
+  fclose(fh);
+
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename
+//! @return true if writing the file succeeded, otherwise false
+//! @param filename name of the source file
+//! @param data  data to write
+//! @param len  number of data elements in data, -1 on error
+//! @param epsilon  epsilon for comparison
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool sdkWriteFile(const char *filename, const T *data, unsigned int len,
+                         const S epsilon, bool verbose, bool append = false) {
+  assert(NULL != filename);
+  assert(NULL != data);
+
+  // open file for writing
+  //    if (append) {
+  std::fstream fh(filename, std::fstream::out | std::fstream::ate);
+
+  if (verbose) {
+    std::cerr << "sdkWriteFile() : Open file " << filename
+              << " for write/append." << std::endl;
+  }
+
+  /*    } else {
+          std::fstream fh(filename, std::fstream::out);
+          if (verbose) {
+              std::cerr << "sdkWriteFile() : Open file " << filename << " for
+     write." << std::endl;
+          }
+      }
+  */
+
+  // check if filestream is valid
+  if (!fh.good()) {
+    if (verbose) {
+      std::cerr << "sdkWriteFile() : Opening file failed." << std::endl;
+    }
+
+    return false;
+  }
+
+  // first write epsilon
+  fh << "# " << epsilon << "\n";
+
+  // write data
+  for (unsigned int i = 0; (i < len) && (fh.good()); ++i) {
+    fh << data[i] << ' ';
+  }
+
+  // Check if writing succeeded
+  if (!fh.good()) {
+    if (verbose) {
+      std::cerr << "sdkWriteFile() : Writing file failed." << std::endl;
+    }
+
+    return false;
+  }
+
+  // file ends with nl
+  fh << std::endl;
+
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Compare two arrays of arbitrary type
+//! @return  true if \a reference and \a data are identical, otherwise false
+//! @param reference  timer_interface to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool compareData(const T *reference, const T *data,
+                        const unsigned int len, const S epsilon,
+                        const float threshold) {
+  assert(epsilon >= 0);
+
+  bool result = true;
+  unsigned int error_count = 0;
+
+  for (unsigned int i = 0; i < len; ++i) {
+    float diff = static_cast<float>(reference[i]) - static_cast<float>(data[i]);
+    bool comp = (diff <= epsilon) && (diff >= -epsilon);
+    result &= comp;
+
+    error_count += !comp;
+
+#if 0
+
+    if (!comp) {
+      std::cerr << "ERROR, i = " << i << ",\t "
+                << reference[i] << " / "
+                << data[i]
+                << " (reference / data)\n";
+    }
+
+#endif
+  }
+
+  if (threshold == 0.0f) {
+    return (result) ? true : false;
+  } else {
+    if (error_count) {
+      printf("%4.2f(%%) of bytes mismatched (count=%d)\n",
+             static_cast<float>(error_count) * 100 / static_cast<float>(len),
+             error_count);
+    }
+
+    return (len * threshold > error_count) ? true : false;
+  }
+}
+
+#ifndef __MIN_EPSILON_ERROR
+#define __MIN_EPSILON_ERROR 1e-3f
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//! Compare two arrays of arbitrary type
+//! @return  true if \a reference and \a data are identical, otherwise false
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//! @param epsilon    threshold % of (# of bytes) for pass/fail
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool compareDataAsFloatThreshold(const T *reference, const T *data,
+                                        const unsigned int len, const S epsilon,
+                                        const float threshold) {
+  assert(epsilon >= 0);
+
+  // If we set epsilon to be 0, let's set a minimum threshold
+  float max_error = MAX((float)epsilon, __MIN_EPSILON_ERROR);
+  int error_count = 0;
+  bool result = true;
+
+  for (unsigned int i = 0; i < len; ++i) {
+    float diff =
+        fabs(static_cast<float>(reference[i]) - static_cast<float>(data[i]));
+    bool comp = (diff < max_error);
+    result &= comp;
+
+    if (!comp) {
+      error_count++;
+    }
+  }
+
+  if (threshold == 0.0f) {
+    if (error_count) {
+      printf("total # of errors = %d\n", error_count);
+    }
+
+    return (error_count == 0) ? true : false;
+  } else {
+    if (error_count) {
+      printf("%4.2f(%%) of bytes mismatched (count=%d)\n",
+             static_cast<float>(error_count) * 100 / static_cast<float>(len),
+             error_count);
+    }
+
+    return ((len * threshold > error_count) ? true : false);
+  }
+}
+
+inline void sdkDumpBin(void *data, unsigned int bytes, const char *filename) {
+  printf("sdkDumpBin: <%s>\n", filename);
+  FILE *fp;
+  FOPEN(fp, filename, "wb");
+  fwrite(data, bytes, 1, fp);
+  fflush(fp);
+  fclose(fp);
+}
+
+inline bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file,
+                                  unsigned int nelements, const float epsilon,
+                                  const float threshold, char *exec_path) {
+  unsigned int *src_buffer, *ref_buffer;
+  FILE *src_fp = NULL, *ref_fp = NULL;
+
+  uint64_t error_count = 0;
+  size_t fsize = 0;
+
+  if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) {
+    printf("compareBin2Bin <unsigned int> unable to open src_file: %s\n",
+           src_file);
+    error_count++;
+  }
+
+  char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
+
+  if (ref_file_path == NULL) {
+    printf("compareBin2Bin <unsigned int>  unable to find <%s> in <%s>\n",
+           ref_file, exec_path);
+    printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
+           ref_file);
+    printf("Aborting comparison!\n");
+    printf("  FAILED\n");
+    error_count++;
+
+    if (src_fp) {
+      fclose(src_fp);
+    }
+
+    if (ref_fp) {
+      fclose(ref_fp);
+    }
+  } else {
+    if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) {
+      printf(
+          "compareBin2Bin <unsigned int>"
+          " unable to open ref_file: %s\n",
+          ref_file_path);
+      error_count++;
+    }
+
+    if (src_fp && ref_fp) {
+      src_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int));
+      ref_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int));
+
+      fsize = fread(src_buffer, nelements, sizeof(unsigned int), src_fp);
+      fsize = fread(ref_buffer, nelements, sizeof(unsigned int), ref_fp);
+
+      printf(
+          "> compareBin2Bin <unsigned int> nelements=%d,"
+          " epsilon=%4.2f, threshold=%4.2f\n",
+          nelements, epsilon, threshold);
+      printf("   src_file <%s>, size=%d bytes\n", src_file,
+             static_cast<int>(fsize));
+      printf("   ref_file <%s>, size=%d bytes\n", ref_file_path,
+             static_cast<int>(fsize));
+
+      if (!compareData<unsigned int, float>(ref_buffer, src_buffer, nelements,
+                                            epsilon, threshold)) {
+        error_count++;
+      }
+
+      fclose(src_fp);
+      fclose(ref_fp);
+
+      free(src_buffer);
+      free(ref_buffer);
+    } else {
+      if (src_fp) {
+        fclose(src_fp);
+      }
+
+      if (ref_fp) {
+        fclose(ref_fp);
+      }
+    }
+  }
+
+  if (error_count == 0) {
+    printf("  OK\n");
+  } else {
+    printf("  FAILURE: %d errors...\n", (unsigned int)error_count);
+  }
+
+  return (error_count == 0);  // returns true if all pixels pass
+}
+
+inline bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file,
+                                   unsigned int nelements, const float epsilon,
+                                   const float threshold, char *exec_path) {
+  float *src_buffer = NULL, *ref_buffer = NULL;
+  FILE *src_fp = NULL, *ref_fp = NULL;
+  size_t fsize = 0;
+
+  uint64_t error_count = 0;
+
+  if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) {
+    printf("compareBin2Bin <float> unable to open src_file: %s\n", src_file);
+    error_count = 1;
+  }
+
+  char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
+
+  if (ref_file_path == NULL) {
+    printf("compareBin2Bin <float> unable to find <%s> in <%s>\n", ref_file,
+           exec_path);
+    printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
+           exec_path);
+    printf("Aborting comparison!\n");
+    printf("  FAILED\n");
+    error_count++;
+
+    if (src_fp) {
+      fclose(src_fp);
+    }
+
+    if (ref_fp) {
+      fclose(ref_fp);
+    }
+  } else {
+    if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) {
+      printf("compareBin2Bin <float> unable to open ref_file: %s\n",
+             ref_file_path);
+      error_count = 1;
+    }
+
+    if (src_fp && ref_fp) {
+      src_buffer = reinterpret_cast<float *>(malloc(nelements * sizeof(float)));
+      ref_buffer = reinterpret_cast<float *>(malloc(nelements * sizeof(float)));
+
+      printf(
+          "> compareBin2Bin <float> nelements=%d, epsilon=%4.2f,"
+          " threshold=%4.2f\n",
+          nelements, epsilon, threshold);
+      fsize = fread(src_buffer, sizeof(float), nelements, src_fp);
+      printf("   src_file <%s>, size=%d bytes\n", src_file,
+             static_cast<int>(fsize * sizeof(float)));
+      fsize = fread(ref_buffer, sizeof(float), nelements, ref_fp);
+      printf("   ref_file <%s>, size=%d bytes\n", ref_file_path,
+             static_cast<int>(fsize * sizeof(float)));
+
+      if (!compareDataAsFloatThreshold<float, float>(
+              ref_buffer, src_buffer, nelements, epsilon, threshold)) {
+        error_count++;
+      }
+
+      fclose(src_fp);
+      fclose(ref_fp);
+
+      free(src_buffer);
+      free(ref_buffer);
+    } else {
+      if (src_fp) {
+        fclose(src_fp);
+      }
+
+      if (ref_fp) {
+        fclose(ref_fp);
+      }
+    }
+  }
+
+  if (error_count == 0) {
+    printf("  OK\n");
+  } else {
+    printf("  FAILURE: %d errors...\n", (unsigned int)error_count);
+  }
+
+  return (error_count == 0);  // returns true if all pixels pass
+}
+
+inline bool sdkCompareL2fe(const float *reference, const float *data,
+                           const unsigned int len, const float epsilon) {
+  assert(epsilon >= 0);
+
+  float error = 0;
+  float ref = 0;
+
+  for (unsigned int i = 0; i < len; ++i) {
+    float diff = reference[i] - data[i];
+    error += diff * diff;
+    ref += reference[i] * reference[i];
+  }
+
+  float normRef = sqrtf(ref);
+
+  if (fabs(ref) < 1e-7) {
+#ifdef _DEBUG
+    std::cerr << "ERROR, reference l2-norm is 0\n";
+#endif
+    return false;
+  }
+
+  float normError = sqrtf(error);
+  error = normError / normRef;
+  bool result = error < epsilon;
+#ifdef _DEBUG
+
+  if (!result) {
+    std::cerr << "ERROR, l2-norm error " << error << " is greater than epsilon "
+              << epsilon << "\n";
+  }
+
+#endif
+
+  return result;
+}
+
+inline bool sdkLoadPPMub(const char *file, unsigned char **data,
+                         unsigned int *w, unsigned int *h) {
+  unsigned int channels;
+  return __loadPPM(file, data, w, h, &channels);
+}
+
+inline bool sdkLoadPPM4ub(const char *file, unsigned char **data,
+                          unsigned int *w, unsigned int *h) {
+  unsigned char *idata = 0;
+  unsigned int channels;
+
+  if (__loadPPM(file, &idata, w, h, &channels)) {
+    // pad 4th component
+    int size = *w * *h;
+    // keep the original pointer
+    unsigned char *idata_orig = idata;
+    *data = (unsigned char *)malloc(sizeof(unsigned char) * size * 4);
+    unsigned char *ptr = *data;
+
+    for (int i = 0; i < size; i++) {
+      *ptr++ = *idata++;
+      *ptr++ = *idata++;
+      *ptr++ = *idata++;
+      *ptr++ = 0;
+    }
+
+    free(idata_orig);
+    return true;
+  } else {
+    free(idata);
+    return false;
+  }
+}
+
+inline bool sdkComparePPM(const char *src_file, const char *ref_file,
+                          const float epsilon, const float threshold,
+                          bool verboseErrors) {
+  unsigned char *src_data, *ref_data;
+  uint64_t error_count = 0;
+  unsigned int ref_width, ref_height;
+  unsigned int src_width, src_height;
+
+  if (src_file == NULL || ref_file == NULL) {
+    if (verboseErrors) {
+      std::cerr << "PPMvsPPM: src_file or ref_file is NULL."
+                   "  Aborting comparison\n";
+    }
+
+    return false;
+  }
+
+  if (verboseErrors) {
+    std::cerr << "> Compare (a)rendered:  <" << src_file << ">\n";
+    std::cerr << ">         (b)reference: <" << ref_file << ">\n";
+  }
+
+  if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true) {
+    if (verboseErrors) {
+      std::cerr << "PPMvsPPM: unable to load ref image file: " << ref_file
+                << "\n";
+    }
+
+    return false;
+  }
+
+  if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true) {
+    std::cerr << "PPMvsPPM: unable to load src image file: " << src_file
+              << "\n";
+    return false;
+  }
+
+  if (src_height != ref_height || src_width != ref_width) {
+    if (verboseErrors) {
+      std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width
+                << "," << src_height << ")vs(" << ref_width << "," << ref_height
+                << ")\n";
+    }
+  }
+
+  if (verboseErrors) {
+    std::cerr << "PPMvsPPM: comparing images size (" << src_width << ","
+              << src_height << ") epsilon(" << epsilon << "), threshold("
+              << threshold * 100 << "%)\n";
+  }
+
+  if (compareData(ref_data, src_data, src_width * src_height * 4, epsilon,
+                  threshold) == false) {
+    error_count = 1;
+  }
+
+  if (error_count == 0) {
+    if (verboseErrors) {
+      std::cerr << "    OK\n\n";
+    }
+  } else {
+    if (verboseErrors) {
+      std::cerr << "    FAILURE!  " << error_count << " errors...\n\n";
+    }
+  }
+
+  // returns true if all pixels pass
+  return (error_count == 0) ? true : false;
+}
+
+inline bool sdkComparePGM(const char *src_file, const char *ref_file,
+                          const float epsilon, const float threshold,
+                          bool verboseErrors) {
+  unsigned char *src_data = 0, *ref_data = 0;
+  uint64_t error_count = 0;
+  unsigned int ref_width, ref_height;
+  unsigned int src_width, src_height;
+
+  if (src_file == NULL || ref_file == NULL) {
+    if (verboseErrors) {
+      std::cerr << "PGMvsPGM: src_file or ref_file is NULL."
+                   "  Aborting comparison\n";
+    }
+
+    return false;
+  }
+
+  if (verboseErrors) {
+    std::cerr << "> Compare (a)rendered:  <" << src_file << ">\n";
+    std::cerr << ">         (b)reference: <" << ref_file << ">\n";
+  }
+
+  if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true) {
+    if (verboseErrors) {
+      std::cerr << "PGMvsPGM: unable to load ref image file: " << ref_file
+                << "\n";
+    }
+
+    return false;
+  }
+
+  if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true) {
+    std::cerr << "PGMvsPGM: unable to load src image file: " << src_file
+              << "\n";
+    return false;
+  }
+
+  if (src_height != ref_height || src_width != ref_width) {
+    if (verboseErrors) {
+      std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width
+                << "," << src_height << ")vs(" << ref_width << "," << ref_height
+                << ")\n";
+    }
+  }
+
+  if (verboseErrors)
+    std::cerr << "PGMvsPGM: comparing images size (" << src_width << ","
+              << src_height << ") epsilon(" << epsilon << "), threshold("
+              << threshold * 100 << "%)\n";
+
+  if (compareData(ref_data, src_data, src_width * src_height, epsilon,
+                  threshold) == false) {
+    error_count = 1;
+  }
+
+  if (error_count == 0) {
+    if (verboseErrors) {
+      std::cerr << "    OK\n\n";
+    }
+  } else {
+    if (verboseErrors) {
+      std::cerr << "    FAILURE!  " << error_count << " errors...\n\n";
+    }
+  }
+
+  // returns true if all pixels pass
+  return (error_count == 0) ? true : false;
+}
+
+#endif  // COMMON_HELPER_IMAGE_H_
\ No newline at end of file
diff --git a/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/01_sycl_dpct_output/Common/helper_string.h b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/01_sycl_dpct_output/Common/helper_string.h
new file mode 100644
index 0000000000..47fb1ac1fa
--- /dev/null
+++ b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/01_sycl_dpct_output/Common/helper_string.h
@@ -0,0 +1,428 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (string parsing, timers, etc)
+#ifndef COMMON_HELPER_STRING_H_
+#define COMMON_HELPER_STRING_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fstream>
+#include <string>
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef _CRT_SECURE_NO_DEPRECATE
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+#ifndef STRCASECMP
+#define STRCASECMP _stricmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP _strnicmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf_s
+#endif
+#else  // Linux Includes
+#include <string.h>
+#include <strings.h>
+
+#ifndef STRCASECMP
+#define STRCASECMP strcasecmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP strncasecmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf
+#endif
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// CUDA Utility Helper Functions
+inline int stringRemoveDelimiter(char delimiter, const char *string) {
+  int string_start = 0;
+
+  while (string[string_start] == delimiter) {
+    string_start++;
+  }
+
+  if (string_start >= static_cast<int>(strlen(string) - 1)) {
+    return 0;
+  }
+
+  return string_start;
+}
+
+inline int getFileExtension(char *filename, char **extension) {
+  int string_length = static_cast<int>(strlen(filename));
+
+  while (filename[string_length--] != '.') {
+    if (string_length == 0) break;
+  }
+
+  if (string_length > 0) string_length += 2;
+
+  if (string_length == 0)
+    *extension = NULL;
+  else
+    *extension = &filename[string_length];
+
+  return string_length;
+}
+
+inline bool checkCmdLineFlag(const int argc, const char **argv,
+                             const char *string_ref) {
+  bool bFound = false;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char *string_argv = &argv[i][string_start];
+
+      const char *equal_pos = strchr(string_argv, '=');
+      int argv_length = static_cast<int>(
+          equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
+
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (length == argv_length &&
+          !STRNCASECMP(string_argv, string_ref, length)) {
+        bFound = true;
+        continue;
+      }
+    }
+  }
+
+  return bFound;
+}
+
+// This function wraps the CUDA Driver API into a template function
+template <class T>
+inline bool getCmdLineArgumentValue(const int argc, const char **argv,
+                                    const char *string_ref, T *value) {
+  bool bFound = false;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char *string_argv = &argv[i][string_start];
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
+          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+          *value = (T)atoi(&string_argv[length + auto_inc]);
+        }
+
+        bFound = true;
+        i = argc;
+      }
+    }
+  }
+
+  return bFound;
+}
+
+inline int getCmdLineArgumentInt(const int argc, const char **argv,
+                                 const char *string_ref) {
+  bool bFound = false;
+  int value = -1;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char *string_argv = &argv[i][string_start];
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
+          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+          value = atoi(&string_argv[length + auto_inc]);
+        } else {
+          value = 0;
+        }
+
+        bFound = true;
+        continue;
+      }
+    }
+  }
+
+  if (bFound) {
+    return value;
+  } else {
+    return 0;
+  }
+}
+
+inline float getCmdLineArgumentFloat(const int argc, const char **argv,
+                                     const char *string_ref) {
+  bool bFound = false;
+  float value = -1;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char *string_argv = &argv[i][string_start];
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
+          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+          value = static_cast<float>(atof(&string_argv[length + auto_inc]));
+        } else {
+          value = 0.f;
+        }
+
+        bFound = true;
+        continue;
+      }
+    }
+  }
+
+  if (bFound) {
+    return value;
+  } else {
+    return 0;
+  }
+}
+
+inline bool getCmdLineArgumentString(const int argc, const char **argv,
+                                     const char *string_ref,
+                                     char **string_retval) {
+  bool bFound = false;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      char *string_argv = const_cast<char *>(&argv[i][string_start]);
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        *string_retval = &string_argv[length + 1];
+        bFound = true;
+        continue;
+      }
+    }
+  }
+
+  if (!bFound) {
+    *string_retval = NULL;
+  }
+
+  return bFound;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Find the path for a file assuming that
+//! files are found in the searchPath.
+//!
+//! @return the path if succeeded, otherwise 0
+//! @param filename         name of the file
+//! @param executable_path  optional absolute path of the executable
+//////////////////////////////////////////////////////////////////////////////
+inline char *sdkFindFilePath(const char *filename,
+                             const char *executable_path) {
+  // <executable_name> defines a variable that is replaced with the name of the
+  // executable
+
+  // Typical relative search paths to locate needed companion files (e.g. sample
+  // input data, or JIT source files) The origin for the relative search may be
+  // the .exe file, a .bat file launching an .exe, a browser .exe launching the
+  // .exe or .bat, etc
+  const char *searchPath[] = {
+      "./",                                           // same dir
+      "./data/",                                      // same dir
+
+      "../../../../Samples/<executable_name>/",       // up 4 in tree
+      "../../../Samples/<executable_name>/",          // up 3 in tree
+      "../../Samples/<executable_name>/",             // up 2 in tree
+
+      "../../../../Samples/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/0_Introduction/<executable_name>/",  // up 4 in tree
+      "../../../Samples/0_Introduction/<executable_name>/",     // up 3 in tree
+      "../../Samples/0_Introduction/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/1_Utilities/<executable_name>/",  // up 4 in tree
+      "../../../Samples/1_Utilities/<executable_name>/",     // up 3 in tree
+      "../../Samples/1_Utilities/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/2_Concepts_and_Techniques/<executable_name>/",  // up 4 in tree
+      "../../../Samples/2_Concepts_and_Techniques/<executable_name>/",     // up 3 in tree
+      "../../Samples/2_Concepts_and_Techniques/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/3_CUDA_Features/<executable_name>/",  // up 4 in tree
+      "../../../Samples/3_CUDA_Features/<executable_name>/",     // up 3 in tree
+      "../../Samples/3_CUDA_Features/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/4_CUDA_Libraries/<executable_name>/",  // up 4 in tree
+      "../../../Samples/4_CUDA_Libraries/<executable_name>/",     // up 3 in tree
+      "../../Samples/4_CUDA_Libraries/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/5_Domain_Specific/<executable_name>/",  // up 4 in tree
+      "../../../Samples/5_Domain_Specific/<executable_name>/",     // up 3 in tree
+      "../../Samples/5_Domain_Specific/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/6_Performance/<executable_name>/",  // up 4 in tree
+      "../../../Samples/6_Performance/<executable_name>/",     // up 3 in tree
+      "../../Samples/6_Performance/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/0_Introduction/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/0_Introduction/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/0_Introduction/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/1_Utilities/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/1_Utilities/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/1_Utilities/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/3_CUDA_Features/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/3_CUDA_Features/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/3_CUDA_Features/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/4_CUDA_Libraries/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/4_CUDA_Libraries/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/4_CUDA_Libraries/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/5_Domain_Specific/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/5_Domain_Specific/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/5_Domain_Specific/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/6_Performance/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/6_Performance/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/6_Performance/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Common/data/",                     // up 4 in tree
+      "../../../Common/data/",                        // up 3 in tree
+      "../../Common/data/"                            // up 2 in tree
+  };
+
+  // Extract the executable name
+  std::string executable_name;
+
+  if (executable_path != 0) {
+    executable_name = std::string(executable_path);
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    // Windows path delimiter
+    size_t delimiter_pos = executable_name.find_last_of('\\');
+    executable_name.erase(0, delimiter_pos + 1);
+
+    if (executable_name.rfind(".exe") != std::string::npos) {
+      // we strip .exe, only if the .exe is found
+      executable_name.resize(executable_name.size() - 4);
+    }
+
+#else
+    // Linux & OSX path delimiter
+    size_t delimiter_pos = executable_name.find_last_of('/');
+    executable_name.erase(0, delimiter_pos + 1);
+#endif
+  }
+
+  // Loop over all search paths and return the first hit
+  for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
+    std::string path(searchPath[i]);
+    size_t executable_name_pos = path.find("<executable_name>");
+
+    // If there is executable_name variable in the searchPath
+    // replace it with the value
+    if (executable_name_pos != std::string::npos) {
+      if (executable_path != 0) {
+        path.replace(executable_name_pos, strlen("<executable_name>"),
+                     executable_name);
+      } else {
+        // Skip this path entry if no executable argument is given
+        continue;
+      }
+    }
+
+#ifdef _DEBUG
+    printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
+#endif
+
+    // Test if the file exists
+    path.append(filename);
+    FILE *fp;
+    FOPEN(fp, path.c_str(), "rb");
+
+    if (fp != NULL) {
+      fclose(fp);
+      // File found
+      // returning an allocated array here for backwards compatibility reasons
+      char *file_path = reinterpret_cast<char *>(malloc(path.length() + 1));
+      STRCPY(file_path, path.length() + 1, path.c_str());
+      return file_path;
+    }
+
+    if (fp) {
+      fclose(fp);
+    }
+  }
+
+  // File not found
+  printf("\nerror: sdkFindFilePath: file <%s> not found!\n", filename);
+  return 0;
+}
+
+#endif  // COMMON_HELPER_STRING_H_
\ No newline at end of file
diff --git a/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/01_sycl_dpct_output/Common/helper_timer.h b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/01_sycl_dpct_output/Common/helper_timer.h
new file mode 100644
index 0000000000..8ebce43598
--- /dev/null
+++ b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/01_sycl_dpct_output/Common/helper_timer.h
@@ -0,0 +1,465 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// Helper Timing Functions
+#ifndef COMMON_HELPER_TIMER_H_
+#define COMMON_HELPER_TIMER_H_
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// includes, system
+#include <vector>
+
+// includes, project
+#include <exception.h>
+
+// Definition of the StopWatch Interface, this is used if we don't want to use
+// the CUT functions But rather in a self contained class interface
+class StopWatchInterface {
+ public:
+  StopWatchInterface() {}
+  virtual ~StopWatchInterface() {}
+
+ public:
+  //! Start time measurement
+  virtual void start() = 0;
+
+  //! Stop time measurement
+  virtual void stop() = 0;
+
+  //! Reset time counters to zero
+  virtual void reset() = 0;
+
+  //! Time in msec. after start. If the stop watch is still running (i.e. there
+  //! was no call to stop()) then the elapsed time is returned, otherwise the
+  //! time between the last start() and stop call is returned
+  virtual float getTime() = 0;
+
+  //! Mean time to date based on the number of times the stopwatch has been
+  //! _stopped_ (ie finished sessions) and the current total time
+  virtual float getAverageTime() = 0;
+};
+
+//////////////////////////////////////////////////////////////////
+// Begin Stopwatch timer class definitions for all OS platforms //
+//////////////////////////////////////////////////////////////////
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+// includes, system
+#define WINDOWS_LEAN_AND_MEAN
+#include <windows.h>
+#undef min
+#undef max
+
+//! Windows specific implementation of StopWatch
+class StopWatchWin : public StopWatchInterface {
+ public:
+  //! Constructor, default
+  StopWatchWin()
+      : start_time(),
+        end_time(),
+        diff_time(0.0f),
+        total_time(0.0f),
+        running(false),
+        clock_sessions(0),
+        freq(0),
+        freq_set(false) {
+    if (!freq_set) {
+      // helper variable
+      LARGE_INTEGER temp;
+
+      // get the tick frequency from the OS
+      QueryPerformanceFrequency(reinterpret_cast<LARGE_INTEGER *>(&temp));
+
+      // convert to type in which it is needed
+      freq = (static_cast<double>(temp.QuadPart)) / 1000.0;
+
+      // rememeber query
+      freq_set = true;
+    }
+  }
+
+  // Destructor
+  ~StopWatchWin() {}
+
+ public:
+  //! Start time measurement
+  inline void start();
+
+  //! Stop time measurement
+  inline void stop();
+
+  //! Reset time counters to zero
+  inline void reset();
+
+  //! Time in msec. after start. If the stop watch is still running (i.e. there
+  //! was no call to stop()) then the elapsed time is returned, otherwise the
+  //! time between the last start() and stop call is returned
+  inline float getTime();
+
+  //! Mean time to date based on the number of times the stopwatch has been
+  //! _stopped_ (ie finished sessions) and the current total time
+  inline float getAverageTime();
+
+ private:
+  // member variables
+
+  //! Start of measurement
+  LARGE_INTEGER start_time;
+  //! End of measurement
+  LARGE_INTEGER end_time;
+
+  //! Time difference between the last start and stop
+  float diff_time;
+
+  //! TOTAL time difference between starts and stops
+  float total_time;
+
+  //! flag if the stop watch is running
+  bool running;
+
+  //! Number of times clock has been started
+  //! and stopped to allow averaging
+  int clock_sessions;
+
+  //! tick frequency
+  double freq;
+
+  //! flag if the frequency has been set
+  bool freq_set;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::start() {
+  QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&start_time));
+  running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement and increment add to the current diff_time summation
+//! variable. Also increment the number of times this clock has been run.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::stop() {
+  QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&end_time));
+  diff_time = static_cast<float>(((static_cast<double>(end_time.QuadPart) -
+                                   static_cast<double>(start_time.QuadPart)) /
+                                  freq));
+
+  total_time += diff_time;
+  clock_sessions++;
+  running = false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. Does not change the timer running state but does
+//! recapture this point in time as the current start time if it is running.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::reset() {
+  diff_time = 0;
+  total_time = 0;
+  clock_sessions = 0;
+
+  if (running) {
+    QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&start_time));
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. after start. If the stop watch is still running (i.e. there
+//! was no call to stop()) then the elapsed time is returned added to the
+//! current diff_time sum, otherwise the current summed time difference alone
+//! is returned.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchWin::getTime() {
+  // Return the TOTAL time to date
+  float retval = total_time;
+
+  if (running) {
+    LARGE_INTEGER temp;
+    QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&temp));
+    retval += static_cast<float>(((static_cast<double>(temp.QuadPart) -
+                                   static_cast<double>(start_time.QuadPart)) /
+                                  freq));
+  }
+
+  return retval;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run based on the total number of COMPLETED runs
+//! and the total time.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchWin::getAverageTime() {
+  return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f;
+}
+#else
+// Declarations for Stopwatch on Linux and Mac OSX
+// includes, system
+#include <sys/time.h>
+#include <ctime>
+
+//! Windows specific implementation of StopWatch
+class StopWatchLinux : public StopWatchInterface {
+ public:
+  //! Constructor, default
+  StopWatchLinux()
+      : start_time(),
+        diff_time(0.0),
+        total_time(0.0),
+        running(false),
+        clock_sessions(0) {}
+
+  // Destructor
+  virtual ~StopWatchLinux() {}
+
+ public:
+  //! Start time measurement
+  inline void start();
+
+  //! Stop time measurement
+  inline void stop();
+
+  //! Reset time counters to zero
+  inline void reset();
+
+  //! Time in msec. after start. If the stop watch is still running (i.e. there
+  //! was no call to stop()) then the elapsed time is returned, otherwise the
+  //! time between the last start() and stop call is returned
+  inline float getTime();
+
+  //! Mean time to date based on the number of times the stopwatch has been
+  //! _stopped_ (ie finished sessions) and the current total time
+  inline float getAverageTime();
+
+ private:
+  // helper functions
+
+  //! Get difference between start time and current time
+  inline float getDiffTime();
+
+ private:
+  // member variables
+
+  //! Start of measurement
+  struct timeval start_time;
+
+  //! Time difference between the last start and stop
+  float diff_time;
+
+  //! TOTAL time difference between starts and stops
+  float total_time;
+
+  //! flag if the stop watch is running
+  bool running;
+
+  //! Number of times clock has been started
+  //! and stopped to allow averaging
+  int clock_sessions;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::start() {
+  gettimeofday(&start_time, 0);
+  running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement and increment add to the current diff_time summation
+//! variable. Also increment the number of times this clock has been run.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::stop() {
+  diff_time = getDiffTime();
+  total_time += diff_time;
+  running = false;
+  clock_sessions++;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. Does not change the timer running state but does
+//! recapture this point in time as the current start time if it is running.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::reset() {
+  diff_time = 0;
+  total_time = 0;
+  clock_sessions = 0;
+
+  if (running) {
+    gettimeofday(&start_time, 0);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. after start. If the stop watch is still running (i.e. there
+//! was no call to stop()) then the elapsed time is returned added to the
+//! current diff_time sum, otherwise the current summed time difference alone
+//! is returned.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getTime() {
+  // Return the TOTAL time to date
+  float retval = total_time;
+
+  if (running) {
+    retval += getDiffTime();
+  }
+
+  return retval;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run based on the total number of COMPLETED runs
+//! and the total time.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getAverageTime() {
+  return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f;
+}
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getDiffTime() {
+  struct timeval t_time;
+  gettimeofday(&t_time, 0);
+
+  // time difference in milli-seconds
+  return static_cast<float>(1000.0 * (t_time.tv_sec - start_time.tv_sec) +
+                            (0.001 * (t_time.tv_usec - start_time.tv_usec)));
+}
+#endif  // WIN32
+
+////////////////////////////////////////////////////////////////////////////////
+//! Timer functionality exported
+
+////////////////////////////////////////////////////////////////////////////////
+//! Create a new timer
+//! @return true if a time has been created, otherwise false
+//! @param  name of the new timer, 0 if the creation failed
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkCreateTimer(StopWatchInterface **timer_interface) {
+// printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface);
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+  *timer_interface = reinterpret_cast<StopWatchInterface *>(new StopWatchWin());
+#else
+  *timer_interface =
+      reinterpret_cast<StopWatchInterface *>(new StopWatchLinux());
+#endif
+  return (*timer_interface != NULL) ? true : false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Delete a timer
+//! @return true if a time has been deleted, otherwise false
+//! @param  name of the timer to delete
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkDeleteTimer(StopWatchInterface **timer_interface) {
+  // printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface) {
+    delete *timer_interface;
+    *timer_interface = NULL;
+  }
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start the time with name \a name
+//! @param name  name of the timer to start
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkStartTimer(StopWatchInterface **timer_interface) {
+  // printf("sdkStartTimer called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface) {
+    (*timer_interface)->start();
+  }
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop the time with name \a name. Does not reset.
+//! @param name  name of the timer to stop
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkStopTimer(StopWatchInterface **timer_interface) {
+  // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface) {
+    (*timer_interface)->stop();
+  }
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Resets the timer's counter.
+//! @param name  name of the timer to reset.
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkResetTimer(StopWatchInterface **timer_interface) {
+  // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface) {
+    (*timer_interface)->reset();
+  }
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Return the average time for timer execution as the total time
+//! for the timer dividied by the number of completed (stopped) runs the timer
+//! has made.
+//! Excludes the current running time if the timer is currently running.
+//! @param name  name of the timer to return the time of
+////////////////////////////////////////////////////////////////////////////////
+inline float sdkGetAverageTimerValue(StopWatchInterface **timer_interface) {
+  //  printf("sdkGetAverageTimerValue called object %08x\n", (void
+  //  *)*timer_interface);
+  if (*timer_interface) {
+    return (*timer_interface)->getAverageTime();
+  } else {
+    return 0.0f;
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Total execution time for the timer over all runs since the last reset
+//! or timer creation.
+//! @param name  name of the timer to obtain the value of.
+////////////////////////////////////////////////////////////////////////////////
+inline float sdkGetTimerValue(StopWatchInterface **timer_interface) {
+  // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface) {
+    return (*timer_interface)->getTime();
+  } else {
+    return 0.0f;
+  }
+}
+
+#endif  // COMMON_HELPER_TIMER_H_
\ No newline at end of file
diff --git a/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/01_sycl_dpct_output/src/matrixMulCUBLAS.cpp.dp.cpp b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/01_sycl_dpct_output/src/matrixMulCUBLAS.cpp.dp.cpp
new file mode 100644
index 0000000000..777f10711e
--- /dev/null
+++ b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/01_sycl_dpct_output/src/matrixMulCUBLAS.cpp.dp.cpp
@@ -0,0 +1,489 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+* Matrix multiplication: C = A * B.
+* Host code.
+*
+* This sample implements matrix multiplication as described in Chapter 3
+* of the programming guide and uses the CUBLAS library to demonstrate
+* the best performance.
+* SOME PRECAUTIONS:
+* IF WE WANT TO CALCULATE ROW-MAJOR MATRIX MULTIPLY C = A * B,
+* WE JUST NEED CALL CUBLAS API IN A REVERSE ORDER: cublasSegemm(B, A)!
+* The reason is explained as follows:
+* CUBLAS library uses column-major storage, but C/C++ use row-major storage.
+* When passing the matrix pointer to CUBLAS, the memory layout alters from
+* row-major to column-major, which is equivalent to an implicit transpose.
+* In the case of row-major C/C++ matrix A, B, and a simple matrix multiplication
+* C = A * B, we can't use the input order like cublasSgemm(A, B)  because of
+* implicit transpose. The actual result of cublasSegemm(A, B) is A(T) * B(T).
+* If col(A(T)) != row(B(T)), equal to row(A) != col(B), A(T) and B(T) are not
+* multipliable. Moreover, even if A(T) and B(T) are multipliable, the result C
+* is a column-based cublas matrix, which means C(T) in C/C++, we need extra
+* transpose code to convert it to a row-based C/C++ matrix.
+* To solve the problem, let's consider our desired result C, a row-major matrix.
+* In cublas format, it is C(T) actually (because of the implicit transpose).
+* C = A * B, so C(T) = (A * B) (T) = B(T) * A(T). Cublas matrice B(T) and A(T)
+* happen to be C/C++ matrice B and A (still because of the implicit transpose)!
+* We don't need extra transpose code, we only need alter the input order!
+*
+* CUBLAS provides high-performance matrix multiplication.
+* See also:
+* V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
+* in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
+* Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
+*/
+
+// Utilities and system includes
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include <assert.h>
+#include <helper_string.h>
+#include <oneapi/mkl.hpp>
+#include <dpct/blas_utils.hpp>
+  // helper for shared functions common to CUDA Samples
+
+// CUDA runtime
+
+// CUDA and CUBLAS functions
+#include <helper_functions.h>
+#include <helper_cuda.h>
+#include <cmath>
+
+#include <chrono>
+
+#ifndef min
+#define min(a, b) ((a < b) ? a : b)
+#endif
+#ifndef max
+#define max(a, b) ((a > b) ? a : b)
+#endif
+
+// Optional Command-line multiplier for matrix sizes
+typedef struct _matrixSize {
+  unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
+} sMatrixSize;
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compute reference data set matrix multiply on CPU
+//! C = A * B
+//! @param C          reference data, computed but preallocated
+//! @param A          matrix A as provided to device
+//! @param B          matrix B as provided to device
+//! @param hA         height of matrix A
+//! @param wB         width of matrix B
+////////////////////////////////////////////////////////////////////////////////
+void matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA,
+                  unsigned int wA, unsigned int wB) {
+  for (unsigned int i = 0; i < hA; ++i)
+    for (unsigned int j = 0; j < wB; ++j) {
+      double sum = 0;
+
+      for (unsigned int k = 0; k < wA; ++k) {
+        double a = A[i * wA + k];
+        double b = B[k * wB + j];
+        sum += a * b;
+      }
+
+      C[i * wB + j] = (float)sum;
+    }
+}
+
+// Allocates a matrix with random float entries.
+void randomInit(float *data, int size) {
+  for (int i = 0; i < size; ++i) data[i] = rand() / (float)RAND_MAX;
+}
+
+void printDiff(float *data1, float *data2, int width, int height,
+               int iListLength, float fListTol) {
+  printf("Listing first %d Differences > %.6f...\n", iListLength, fListTol);
+  int i, j, k;
+  int error_count = 0;
+
+  for (j = 0; j < height; j++) {
+    if (error_count < iListLength) {
+      printf("\n  Row %d:\n", j);
+    }
+
+    for (i = 0; i < width; i++) {
+      k = j * width + i;
+      float fDiff = fabs(data1[k] - data2[k]);
+
+      if (fDiff > fListTol) {
+        if (error_count < iListLength) {
+          printf("    Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j,
+                 data1[k], data2[k], fDiff);
+        }
+
+        error_count++;
+      }
+    }
+  }
+
+  printf(" \n  Total Errors = %d\n", error_count);
+}
+
+void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple,
+                    sMatrixSize &matrix_size) try {
+  // By default, we use device 0, otherwise we override the device ID based on
+  // what is provided at the command line
+  int error;
+  devID = 0;
+
+  devID = findCudaDevice(argc, (const char **)argv);
+
+  if (checkCmdLineFlag(argc, (const char **)argv, "sizemult")) {
+    iSizeMultiple =
+        getCmdLineArgumentInt(argc, (const char **)argv, "sizemult");
+  }
+
+  iSizeMultiple = min(iSizeMultiple, 10);
+  iSizeMultiple = max(iSizeMultiple, 1);
+
+  dpct::device_info deviceProp;
+
+  /*
+  DPCT1003:20: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  error =
+      (dpct::dev_mgr::instance().get_device(devID).get_device_info(deviceProp),
+       0);
+
+  printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID,
+         /*
+         DPCT1005:21: The SYCL device version is different from CUDA Compute
+         Compatibility. You may need to rewrite this code.
+         */
+         deviceProp.get_name(), deviceProp.get_major_version(),
+         deviceProp.get_minor_version());
+
+  int block_size = 32;
+
+  matrix_size.uiWA = 3 * block_size * iSizeMultiple;
+  matrix_size.uiHA = 4 * block_size * iSizeMultiple;
+  matrix_size.uiWB = 2 * block_size * iSizeMultiple;
+  matrix_size.uiHB = 3 * block_size * iSizeMultiple;
+  matrix_size.uiWC = 2 * block_size * iSizeMultiple;
+  matrix_size.uiHC = 4 * block_size * iSizeMultiple;
+
+  printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n", matrix_size.uiHA,
+         matrix_size.uiWA, matrix_size.uiHB, matrix_size.uiWB, matrix_size.uiHC,
+         matrix_size.uiWC);
+
+  if (matrix_size.uiWA != matrix_size.uiHB ||
+      matrix_size.uiHA != matrix_size.uiHC ||
+      matrix_size.uiWB != matrix_size.uiWC) {
+    printf("ERROR: Matrix sizes do not match!\n");
+    exit(-1);
+  }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Run a simple test matrix multiply using CUBLAS
+////////////////////////////////////////////////////////////////////////////////
+int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size) {
+  dpct::device_info deviceProp;
+
+  /*
+  DPCT1003:22: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors(
+      (dpct::dev_mgr::instance().get_device(devID).get_device_info(deviceProp),
+       0));
+
+  int block_size = 32;
+
+  // set seed for rand()
+  srand(2006);
+
+  // allocate host memory for matrices A and B
+  unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
+  unsigned int mem_size_A = sizeof(float) * size_A;
+  float *h_A = (float *)malloc(mem_size_A);
+  unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
+  unsigned int mem_size_B = sizeof(float) * size_B;
+  float *h_B = (float *)malloc(mem_size_B);
+
+  // set seed for rand()
+  srand(2006);
+
+  // initialize host memory
+  randomInit(h_A, size_A);
+  randomInit(h_B, size_B);
+
+  // allocate device memory
+  float *d_A, *d_B, *d_C;
+  unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
+  unsigned int mem_size_C = sizeof(float) * size_C;
+
+  // allocate host memory for the result
+  float *h_C = (float *)malloc(mem_size_C);
+  float *h_CUBLAS = (float *)malloc(mem_size_C);
+
+  /*
+  DPCT1003:23: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors((
+      d_A = (float *)sycl::malloc_device(mem_size_A, dpct::get_default_queue()),
+      0));
+  /*
+  DPCT1003:24: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors((
+      d_B = (float *)sycl::malloc_device(mem_size_B, dpct::get_default_queue()),
+      0));
+  /*
+  DPCT1003:25: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors(
+      (dpct::get_default_queue().memcpy(d_A, h_A, mem_size_A).wait(), 0));
+  /*
+  DPCT1003:26: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors(
+      (dpct::get_default_queue().memcpy(d_B, h_B, mem_size_B).wait(), 0));
+  /*
+  DPCT1003:27: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors((
+      d_C = (float *)sycl::malloc_device(mem_size_C, dpct::get_default_queue()),
+      0));
+
+  // setup execution parameters
+  sycl::range<3> threads(1, block_size, block_size);
+  sycl::range<3> grid(1, matrix_size.uiHC / threads[1],
+                      matrix_size.uiWC / threads[2]);
+
+  // create and start timer
+  printf("Computing result using CUBLAS...");
+
+  // execute the kernel
+  int nIter = 30;
+
+  // CUBLAS version 2.0
+  {
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+    sycl::queue *handle;
+    dpct::event_ptr start, stop;
+    std::chrono::time_point<std::chrono::steady_clock> start_ct1;
+    std::chrono::time_point<std::chrono::steady_clock> stop_ct1;
+
+    /*
+    DPCT1003:28: Migrated API does not return error code. (*, 0) is inserted.
+    You may need to rewrite this code.
+    */
+    checkCudaErrors((handle = &dpct::get_default_queue(), 0));
+
+    // Perform warmup operation with cublas
+    /*
+    DPCT1003:29: Migrated API does not return error code. (*, 0) is inserted.
+    You may need to rewrite this code.
+    */
+    checkCudaErrors(
+        (oneapi::mkl::blas::column_major::gemm(
+             *handle, oneapi::mkl::transpose::nontrans,
+             oneapi::mkl::transpose::nontrans, matrix_size.uiWB,
+             matrix_size.uiHA, matrix_size.uiWA, alpha, d_B, matrix_size.uiWB,
+             d_A, matrix_size.uiWA, beta, d_C, matrix_size.uiWB),
+         0));
+
+    // Allocate CUDA events that we'll use for timing
+    /*
+    DPCT1003:30: Migrated API does not return error code. (*, 0) is inserted.
+    You may need to rewrite this code.
+    */
+    checkCudaErrors((start = new sycl::event(), 0));
+    /*
+    DPCT1003:31: Migrated API does not return error code. (*, 0) is inserted.
+    You may need to rewrite this code.
+    */
+    checkCudaErrors((stop = new sycl::event(), 0));
+
+    // Record the start event
+    /*
+    DPCT1012:32: Detected kernel execution time measurement pattern and
+    generated an initial code for time measurements in SYCL. You can change the
+    way time is measured depending on your goals.
+    */
+    /*
+    DPCT1024:33: The original code returned the error code that was further
+    consumed by the program logic. This original code was replaced with 0. You
+    may need to rewrite the program logic consuming the error code.
+    */
+    start_ct1 = std::chrono::steady_clock::now();
+    checkCudaErrors(0);
+
+    for (int j = 0; j < nIter; j++) {
+      // note cublas is column primary!
+      // need to transpose the order
+      /*
+      DPCT1003:34: Migrated API does not return error code. (*, 0) is inserted.
+      You may need to rewrite this code.
+      */
+      checkCudaErrors(
+          (oneapi::mkl::blas::column_major::gemm(
+               *handle, oneapi::mkl::transpose::nontrans,
+               oneapi::mkl::transpose::nontrans, matrix_size.uiWB,
+               matrix_size.uiHA, matrix_size.uiWA, alpha, d_B, matrix_size.uiWB,
+               d_A, matrix_size.uiWA, beta, d_C, matrix_size.uiWB),
+           0));
+    }
+
+    printf("done.\n");
+
+    // Record the stop event
+    /*
+    DPCT1012:35: Detected kernel execution time measurement pattern and
+    generated an initial code for time measurements in SYCL. You can change the
+    way time is measured depending on your goals.
+    */
+    /*
+    DPCT1024:36: The original code returned the error code that was further
+    consumed by the program logic. This original code was replaced with 0. You
+    may need to rewrite the program logic consuming the error code.
+    */
+    stop_ct1 = std::chrono::steady_clock::now();
+    checkCudaErrors(0);
+
+    // Wait for the stop event to complete
+    checkCudaErrors(0);
+
+    float msecTotal = 0.0f;
+    /*
+    DPCT1003:37: Migrated API does not return error code. (*, 0) is inserted.
+    You may need to rewrite this code.
+    */
+    checkCudaErrors((msecTotal = std::chrono::duration<float, std::milli>(
+                                     stop_ct1 - start_ct1)
+                                     .count(),
+                     0));
+
+    // Compute and print the performance
+    float msecPerMatrixMul = msecTotal / nIter;
+    double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiHC *
+                               (double)matrix_size.uiWC *
+                               (double)matrix_size.uiHB;
+    double gigaFlops =
+        (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
+    printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
+           gigaFlops, msecPerMatrixMul, flopsPerMatrixMul);
+
+    // copy result from device to host
+    /*
+    DPCT1003:38: Migrated API does not return error code. (*, 0) is inserted.
+    You may need to rewrite this code.
+    */
+    checkCudaErrors((
+        dpct::get_default_queue().memcpy(h_CUBLAS, d_C, mem_size_C).wait(), 0));
+
+    // Destroy the handle
+    /*
+    DPCT1003:39: Migrated API does not return error code. (*, 0) is inserted.
+    You may need to rewrite this code.
+    */
+    checkCudaErrors((handle = nullptr, 0));
+  }
+
+  // compute reference solution
+  printf("Computing result using host CPU...");
+  float *reference = (float *)malloc(mem_size_C);
+  matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA,
+               matrix_size.uiWB);
+  printf("done.\n");
+
+  // check result (CUBLAS)
+  bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f);
+
+  if (resCUBLAS != true) {
+    printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100,
+              1.0e-5f);
+  }
+
+  printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n",
+         (true == resCUBLAS) ? "PASS" : "FAIL");
+
+  printf(
+      "\nNOTE: The CUDA Samples are not meant for performance measurements. "
+      "Results may vary when GPU Boost is enabled.\n");
+
+  // clean up memory
+  free(h_A);
+  free(h_B);
+  free(h_C);
+  free(reference);
+  /*
+  DPCT1003:40: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors((sycl::free(d_A, dpct::get_default_queue()), 0));
+  /*
+  DPCT1003:41: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors((sycl::free(d_B, dpct::get_default_queue()), 0));
+  /*
+  DPCT1003:42: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors((sycl::free(d_C, dpct::get_default_queue()), 0));
+
+  if (resCUBLAS == true) {
+    return EXIT_SUCCESS;  // return value = 1
+  } else {
+    return EXIT_FAILURE;  // return value = 0
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+int main(int argc, char **argv) {
+  printf("[Matrix Multiply CUBLAS] - Starting...\n");
+
+  int devID = 0, sizeMult = 5;
+  sMatrixSize matrix_size;
+
+  initializeCUDA(argc, argv, devID, sizeMult, matrix_size);
+
+  int matrix_result = matrixMultiply(argc, argv, devID, matrix_size);
+
+  return matrix_result;
+}
\ No newline at end of file
diff --git a/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/CMakeLists.txt b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/CMakeLists.txt
new file mode 100644
index 0000000000..f3cd0a8f89
--- /dev/null
+++ b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -std=c++17")
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lmkl_sycl -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core")
+include_directories(${CMAKE_SOURCE_DIR}/02_sycl_dpct_migrated/Common/)
+
+add_executable (02_sycl_dpct_migrated src/matrixMulCUBLAS.cpp)
+target_link_libraries(02_sycl_dpct_migrated sycl)
+
+add_custom_target (run_matrix_mul cd ${CMAKE_SOURCE_DIR}/02_sycl_dpct_migrated/ && ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/02_sycl_dpct_migrated)
diff --git a/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/Common/exception.h b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/Common/exception.h
new file mode 100644
index 0000000000..ca8ac25258
--- /dev/null
+++ b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/Common/exception.h
@@ -0,0 +1,151 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* CUda UTility Library */
+#ifndef COMMON_EXCEPTION_H_
+#define COMMON_EXCEPTION_H_
+
+// includes, system
+#include <stdlib.h>
+#include <exception>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+//! Exception wrapper.
+//! @param Std_Exception Exception out of namespace std for easy typing.
+template <class Std_Exception>
+class Exception : public Std_Exception {
+ public:
+  //! @brief Static construction interface
+  //! @return Alwayss throws ( Located_Exception<Exception>)
+  //! @param file file in which the Exception occurs
+  //! @param line line in which the Exception occurs
+  //! @param detailed details on the code fragment causing the Exception
+  static void throw_it(const char *file, const int line,
+                       const char *detailed = "-");
+
+  //! Static construction interface
+  //! @return Alwayss throws ( Located_Exception<Exception>)
+  //! @param file file in which the Exception occurs
+  //! @param line line in which the Exception occurs
+  //! @param detailed details on the code fragment causing the Exception
+  static void throw_it(const char *file, const int line,
+                       const std::string &detailed);
+
+  //! Destructor
+  virtual ~Exception() throw();
+
+ private:
+  //! Constructor, default (private)
+  Exception();
+
+  //! Constructor, standard
+  //! @param str string returned by what()
+  explicit Exception(const std::string &str);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//! Exception handler function for arbitrary exceptions
+//! @param ex exception to handle
+////////////////////////////////////////////////////////////////////////////////
+template <class Exception_Typ>
+inline void handleException(const Exception_Typ &ex) {
+  std::cerr << ex.what() << std::endl;
+
+  exit(EXIT_FAILURE);
+}
+
+//! Convenience macros
+
+//! Exception caused by dynamic program behavior, e.g. file does not exist
+#define RUNTIME_EXCEPTION(msg) \
+  Exception<std::runtime_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Logic exception in program, e.g. an assert failed
+#define LOGIC_EXCEPTION(msg) \
+  Exception<std::logic_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Out of range exception
+#define RANGE_EXCEPTION(msg) \
+  Exception<std::range_error>::throw_it(__FILE__, __LINE__, msg)
+
+////////////////////////////////////////////////////////////////////////////////
+//! Implementation
+
+// includes, system
+#include <sstream>
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface.
+//! @param  Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <class Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const char *detailed) {
+  std::stringstream s;
+
+  // Quiet heavy-weight but exceptions are not for
+  // performance / release versions
+  s << "Exception in file '" << file << "' in line " << line << "\n"
+    << "Detailed description: " << detailed << "\n";
+
+  throw Exception(s.str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface.
+//! @param  Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <class Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const std::string &msg) {
+  throw_it(file, line, msg.c_str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, default (private).
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::Exception() : Std_Exception("Unknown Exception.\n") {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, standard (private).
+//! String returned by what().
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::Exception(const std::string &s) : Std_Exception(s) {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Destructor
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::~Exception() throw() {}
+
+  // functions, exported
+
+#endif  // COMMON_EXCEPTION_H_
diff --git a/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/Common/helper_cuda.h b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/Common/helper_cuda.h
new file mode 100644
index 0000000000..396729118e
--- /dev/null
+++ b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/Common/helper_cuda.h
@@ -0,0 +1,1051 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions for initialization and error checking
+
+#ifndef COMMON_HELPER_CUDA_H_
+#define COMMON_HELPER_CUDA_H_
+
+#pragma once
+
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <helper_string.h>
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// Note, it is required that your SDK sample to include the proper header
+// files, please refer the CUDA examples for examples of the needed CUDA
+// headers, which may change depending on which CUDA functions are used.
+
+// CUDA Runtime error messages
+#ifdef __DPCT_HPP__
+static const char *_cudaGetErrorEnum(int error) {
+  /*
+  DPCT1009:0: SYCL uses exceptions to report errors and does not use the error
+  codes. The original code was commented out and a warning string was inserted.
+  You need to rewrite this code.
+  */
+  return "cudaGetErrorName is not supported" /*cudaGetErrorName(error)*/;
+}
+#endif
+
+#ifdef CUDA_DRIVER_API
+// CUDA Driver API errors
+static const char *_cudaGetErrorEnum(CUresult error) {
+  static char unknown[] = "<unknown>";
+  const char *ret = NULL;
+  cuGetErrorName(error, &ret);
+  return ret ? ret : unknown;
+}
+#endif
+
+#ifdef CUBLAS_API_H_
+// cuBLAS API errors
+static const char *_cudaGetErrorEnum(int error) {
+  switch (error) {
+    case 0:
+      return "CUBLAS_STATUS_SUCCESS";
+
+    case 1:
+      return "CUBLAS_STATUS_NOT_INITIALIZED";
+
+    case 3:
+      return "CUBLAS_STATUS_ALLOC_FAILED";
+
+    case 7:
+      return "CUBLAS_STATUS_INVALID_VALUE";
+
+    case 8:
+      return "CUBLAS_STATUS_ARCH_MISMATCH";
+
+    case 11:
+      return "CUBLAS_STATUS_MAPPING_ERROR";
+
+    case 13:
+      return "CUBLAS_STATUS_EXECUTION_FAILED";
+
+    case 14:
+      return "CUBLAS_STATUS_INTERNAL_ERROR";
+
+    case 15:
+      return "CUBLAS_STATUS_NOT_SUPPORTED";
+
+    case 16:
+      return "CUBLAS_STATUS_LICENSE_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef _CUFFT_H_
+// cuFFT API errors
+static const char *_cudaGetErrorEnum(cufftResult error) {
+  switch (error) {
+    case CUFFT_SUCCESS:
+      return "CUFFT_SUCCESS";
+
+    case CUFFT_INVALID_PLAN:
+      return "CUFFT_INVALID_PLAN";
+
+    case CUFFT_ALLOC_FAILED:
+      return "CUFFT_ALLOC_FAILED";
+
+    case CUFFT_INVALID_TYPE:
+      return "CUFFT_INVALID_TYPE";
+
+    case CUFFT_INVALID_VALUE:
+      return "CUFFT_INVALID_VALUE";
+
+    case CUFFT_INTERNAL_ERROR:
+      return "CUFFT_INTERNAL_ERROR";
+
+    case CUFFT_EXEC_FAILED:
+      return "CUFFT_EXEC_FAILED";
+
+    case CUFFT_SETUP_FAILED:
+      return "CUFFT_SETUP_FAILED";
+
+    case CUFFT_INVALID_SIZE:
+      return "CUFFT_INVALID_SIZE";
+
+    case CUFFT_UNALIGNED_DATA:
+      return "CUFFT_UNALIGNED_DATA";
+
+    case CUFFT_INCOMPLETE_PARAMETER_LIST:
+      return "CUFFT_INCOMPLETE_PARAMETER_LIST";
+
+    case CUFFT_INVALID_DEVICE:
+      return "CUFFT_INVALID_DEVICE";
+
+    case CUFFT_PARSE_ERROR:
+      return "CUFFT_PARSE_ERROR";
+
+    case CUFFT_NO_WORKSPACE:
+      return "CUFFT_NO_WORKSPACE";
+
+    case CUFFT_NOT_IMPLEMENTED:
+      return "CUFFT_NOT_IMPLEMENTED";
+
+    case CUFFT_LICENSE_ERROR:
+      return "CUFFT_LICENSE_ERROR";
+
+    case CUFFT_NOT_SUPPORTED:
+      return "CUFFT_NOT_SUPPORTED";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CUSPARSEAPI
+// cuSPARSE API errors
+static const char *_cudaGetErrorEnum(cusparseStatus_t error) {
+  switch (error) {
+    case CUSPARSE_STATUS_SUCCESS:
+      return "CUSPARSE_STATUS_SUCCESS";
+
+    case CUSPARSE_STATUS_NOT_INITIALIZED:
+      return "CUSPARSE_STATUS_NOT_INITIALIZED";
+
+    case CUSPARSE_STATUS_ALLOC_FAILED:
+      return "CUSPARSE_STATUS_ALLOC_FAILED";
+
+    case CUSPARSE_STATUS_INVALID_VALUE:
+      return "CUSPARSE_STATUS_INVALID_VALUE";
+
+    case CUSPARSE_STATUS_ARCH_MISMATCH:
+      return "CUSPARSE_STATUS_ARCH_MISMATCH";
+
+    case CUSPARSE_STATUS_MAPPING_ERROR:
+      return "CUSPARSE_STATUS_MAPPING_ERROR";
+
+    case CUSPARSE_STATUS_EXECUTION_FAILED:
+      return "CUSPARSE_STATUS_EXECUTION_FAILED";
+
+    case CUSPARSE_STATUS_INTERNAL_ERROR:
+      return "CUSPARSE_STATUS_INTERNAL_ERROR";
+
+    case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CUSOLVER_COMMON_H_
+// cuSOLVER API errors
+static const char *_cudaGetErrorEnum(cusolverStatus_t error) {
+  switch (error) {
+    case CUSOLVER_STATUS_SUCCESS:
+      return "CUSOLVER_STATUS_SUCCESS";
+    case CUSOLVER_STATUS_NOT_INITIALIZED:
+      return "CUSOLVER_STATUS_NOT_INITIALIZED";
+    case CUSOLVER_STATUS_ALLOC_FAILED:
+      return "CUSOLVER_STATUS_ALLOC_FAILED";
+    case CUSOLVER_STATUS_INVALID_VALUE:
+      return "CUSOLVER_STATUS_INVALID_VALUE";
+    case CUSOLVER_STATUS_ARCH_MISMATCH:
+      return "CUSOLVER_STATUS_ARCH_MISMATCH";
+    case CUSOLVER_STATUS_MAPPING_ERROR:
+      return "CUSOLVER_STATUS_MAPPING_ERROR";
+    case CUSOLVER_STATUS_EXECUTION_FAILED:
+      return "CUSOLVER_STATUS_EXECUTION_FAILED";
+    case CUSOLVER_STATUS_INTERNAL_ERROR:
+      return "CUSOLVER_STATUS_INTERNAL_ERROR";
+    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+    case CUSOLVER_STATUS_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_NOT_SUPPORTED ";
+    case CUSOLVER_STATUS_ZERO_PIVOT:
+      return "CUSOLVER_STATUS_ZERO_PIVOT";
+    case CUSOLVER_STATUS_INVALID_LICENSE:
+      return "CUSOLVER_STATUS_INVALID_LICENSE";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CURAND_H_
+// cuRAND API errors
+static const char *_cudaGetErrorEnum(int error) {
+  switch (error) {
+    case 0:
+      return "CURAND_STATUS_SUCCESS";
+
+    case 100:
+      return "CURAND_STATUS_VERSION_MISMATCH";
+
+    case 101:
+      return "CURAND_STATUS_NOT_INITIALIZED";
+
+    case 102:
+      return "CURAND_STATUS_ALLOCATION_FAILED";
+
+    case 103:
+      return "CURAND_STATUS_TYPE_ERROR";
+
+    case 104:
+      return "CURAND_STATUS_OUT_OF_RANGE";
+
+    case 105:
+      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+
+    case 106:
+      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+
+    case 201:
+      return "CURAND_STATUS_LAUNCH_FAILURE";
+
+    case 202:
+      return "CURAND_STATUS_PREEXISTING_FAILURE";
+
+    case 203:
+      return "CURAND_STATUS_INITIALIZATION_FAILED";
+
+    case 204:
+      return "CURAND_STATUS_ARCH_MISMATCH";
+
+    case 999:
+      return "CURAND_STATUS_INTERNAL_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef NVJPEGAPI
+// nvJPEG API errors
+static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
+  switch (error) {
+    case NVJPEG_STATUS_SUCCESS:
+      return "NVJPEG_STATUS_SUCCESS";
+
+    case NVJPEG_STATUS_NOT_INITIALIZED:
+      return "NVJPEG_STATUS_NOT_INITIALIZED";
+
+    case NVJPEG_STATUS_INVALID_PARAMETER:
+      return "NVJPEG_STATUS_INVALID_PARAMETER";
+
+    case NVJPEG_STATUS_BAD_JPEG:
+      return "NVJPEG_STATUS_BAD_JPEG";
+
+    case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
+      return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
+
+    case NVJPEG_STATUS_ALLOCATOR_FAILURE:
+      return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
+
+    case NVJPEG_STATUS_EXECUTION_FAILED:
+      return "NVJPEG_STATUS_EXECUTION_FAILED";
+
+    case NVJPEG_STATUS_ARCH_MISMATCH:
+      return "NVJPEG_STATUS_ARCH_MISMATCH";
+
+    case NVJPEG_STATUS_INTERNAL_ERROR:
+      return "NVJPEG_STATUS_INTERNAL_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef NV_NPPIDEFS_H
+// NPP API errors
+static const char *_cudaGetErrorEnum(NppStatus error) {
+  switch (error) {
+    case NPP_NOT_SUPPORTED_MODE_ERROR:
+      return "NPP_NOT_SUPPORTED_MODE_ERROR";
+
+    case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
+      return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
+
+    case NPP_RESIZE_NO_OPERATION_ERROR:
+      return "NPP_RESIZE_NO_OPERATION_ERROR";
+
+    case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
+      return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+    case NPP_BAD_ARG_ERROR:
+      return "NPP_BAD_ARGUMENT_ERROR";
+
+    case NPP_COEFF_ERROR:
+      return "NPP_COEFFICIENT_ERROR";
+
+    case NPP_RECT_ERROR:
+      return "NPP_RECTANGLE_ERROR";
+
+    case NPP_QUAD_ERROR:
+      return "NPP_QUADRANGLE_ERROR";
+
+    case NPP_MEM_ALLOC_ERR:
+      return "NPP_MEMORY_ALLOCATION_ERROR";
+
+    case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_INVALID_INPUT:
+      return "NPP_INVALID_INPUT";
+
+    case NPP_POINTER_ERROR:
+      return "NPP_POINTER_ERROR";
+
+    case NPP_WARNING:
+      return "NPP_WARNING";
+
+    case NPP_ODD_ROI_WARNING:
+      return "NPP_ODD_ROI_WARNING";
+#else
+
+    // These are for CUDA 5.5 or higher
+    case NPP_BAD_ARGUMENT_ERROR:
+      return "NPP_BAD_ARGUMENT_ERROR";
+
+    case NPP_COEFFICIENT_ERROR:
+      return "NPP_COEFFICIENT_ERROR";
+
+    case NPP_RECTANGLE_ERROR:
+      return "NPP_RECTANGLE_ERROR";
+
+    case NPP_QUADRANGLE_ERROR:
+      return "NPP_QUADRANGLE_ERROR";
+
+    case NPP_MEMORY_ALLOCATION_ERR:
+      return "NPP_MEMORY_ALLOCATION_ERROR";
+
+    case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_INVALID_HOST_POINTER_ERROR:
+      return "NPP_INVALID_HOST_POINTER_ERROR";
+
+    case NPP_INVALID_DEVICE_POINTER_ERROR:
+      return "NPP_INVALID_DEVICE_POINTER_ERROR";
+#endif
+
+    case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_TEXTURE_BIND_ERROR:
+      return "NPP_TEXTURE_BIND_ERROR";
+
+    case NPP_WRONG_INTERSECTION_ROI_ERROR:
+      return "NPP_WRONG_INTERSECTION_ROI_ERROR";
+
+    case NPP_NOT_EVEN_STEP_ERROR:
+      return "NPP_NOT_EVEN_STEP_ERROR";
+
+    case NPP_INTERPOLATION_ERROR:
+      return "NPP_INTERPOLATION_ERROR";
+
+    case NPP_RESIZE_FACTOR_ERROR:
+      return "NPP_RESIZE_FACTOR_ERROR";
+
+    case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
+      return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+    case NPP_MEMFREE_ERR:
+      return "NPP_MEMFREE_ERR";
+
+    case NPP_MEMSET_ERR:
+      return "NPP_MEMSET_ERR";
+
+    case NPP_MEMCPY_ERR:
+      return "NPP_MEMCPY_ERROR";
+
+    case NPP_MIRROR_FLIP_ERR:
+      return "NPP_MIRROR_FLIP_ERR";
+#else
+
+    case NPP_MEMFREE_ERROR:
+      return "NPP_MEMFREE_ERROR";
+
+    case NPP_MEMSET_ERROR:
+      return "NPP_MEMSET_ERROR";
+
+    case NPP_MEMCPY_ERROR:
+      return "NPP_MEMCPY_ERROR";
+
+    case NPP_MIRROR_FLIP_ERROR:
+      return "NPP_MIRROR_FLIP_ERROR";
+#endif
+
+    case NPP_ALIGNMENT_ERROR:
+      return "NPP_ALIGNMENT_ERROR";
+
+    case NPP_STEP_ERROR:
+      return "NPP_STEP_ERROR";
+
+    case NPP_SIZE_ERROR:
+      return "NPP_SIZE_ERROR";
+
+    case NPP_NULL_POINTER_ERROR:
+      return "NPP_NULL_POINTER_ERROR";
+
+    case NPP_CUDA_KERNEL_EXECUTION_ERROR:
+      return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
+
+    case NPP_NOT_IMPLEMENTED_ERROR:
+      return "NPP_NOT_IMPLEMENTED_ERROR";
+
+    case NPP_ERROR:
+      return "NPP_ERROR";
+
+    case NPP_SUCCESS:
+      return "NPP_SUCCESS";
+
+    case NPP_WRONG_INTERSECTION_QUAD_WARNING:
+      return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
+
+    case NPP_MISALIGNED_DST_ROI_WARNING:
+      return "NPP_MISALIGNED_DST_ROI_WARNING";
+
+    case NPP_AFFINE_QUAD_INCORRECT_WARNING:
+      return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
+
+    case NPP_DOUBLE_SIZE_WARNING:
+      return "NPP_DOUBLE_SIZE_WARNING";
+
+    case NPP_WRONG_INTERSECTION_ROI_WARNING:
+      return "NPP_WRONG_INTERSECTION_ROI_WARNING";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
+    /* These are 6.0 or higher */
+    case NPP_LUT_PALETTE_BITSIZE_ERROR:
+      return "NPP_LUT_PALETTE_BITSIZE_ERROR";
+
+    case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
+      return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
+
+    case NPP_QUALITY_INDEX_ERROR:
+      return "NPP_QUALITY_INDEX_ERROR";
+
+    case NPP_CHANNEL_ORDER_ERROR:
+      return "NPP_CHANNEL_ORDER_ERROR";
+
+    case NPP_ZERO_MASK_VALUE_ERROR:
+      return "NPP_ZERO_MASK_VALUE_ERROR";
+
+    case NPP_NUMBER_OF_CHANNELS_ERROR:
+      return "NPP_NUMBER_OF_CHANNELS_ERROR";
+
+    case NPP_COI_ERROR:
+      return "NPP_COI_ERROR";
+
+    case NPP_DIVISOR_ERROR:
+      return "NPP_DIVISOR_ERROR";
+
+    case NPP_CHANNEL_ERROR:
+      return "NPP_CHANNEL_ERROR";
+
+    case NPP_STRIDE_ERROR:
+      return "NPP_STRIDE_ERROR";
+
+    case NPP_ANCHOR_ERROR:
+      return "NPP_ANCHOR_ERROR";
+
+    case NPP_MASK_SIZE_ERROR:
+      return "NPP_MASK_SIZE_ERROR";
+
+    case NPP_MOMENT_00_ZERO_ERROR:
+      return "NPP_MOMENT_00_ZERO_ERROR";
+
+    case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
+      return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
+
+    case NPP_THRESHOLD_ERROR:
+      return "NPP_THRESHOLD_ERROR";
+
+    case NPP_CONTEXT_MATCH_ERROR:
+      return "NPP_CONTEXT_MATCH_ERROR";
+
+    case NPP_FFT_FLAG_ERROR:
+      return "NPP_FFT_FLAG_ERROR";
+
+    case NPP_FFT_ORDER_ERROR:
+      return "NPP_FFT_ORDER_ERROR";
+
+    case NPP_SCALE_RANGE_ERROR:
+      return "NPP_SCALE_RANGE_ERROR";
+
+    case NPP_DATA_TYPE_ERROR:
+      return "NPP_DATA_TYPE_ERROR";
+
+    case NPP_OUT_OFF_RANGE_ERROR:
+      return "NPP_OUT_OFF_RANGE_ERROR";
+
+    case NPP_DIVIDE_BY_ZERO_ERROR:
+      return "NPP_DIVIDE_BY_ZERO_ERROR";
+
+    case NPP_RANGE_ERROR:
+      return "NPP_RANGE_ERROR";
+
+    case NPP_NO_MEMORY_ERROR:
+      return "NPP_NO_MEMORY_ERROR";
+
+    case NPP_ERROR_RESERVED:
+      return "NPP_ERROR_RESERVED";
+
+    case NPP_NO_OPERATION_WARNING:
+      return "NPP_NO_OPERATION_WARNING";
+
+    case NPP_DIVIDE_BY_ZERO_WARNING:
+      return "NPP_DIVIDE_BY_ZERO_WARNING";
+#endif
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000
+    /* These are 7.0 or higher */
+    case NPP_OVERFLOW_ERROR:
+      return "NPP_OVERFLOW_ERROR";
+
+    case NPP_CORRUPTED_DATA_ERROR:
+      return "NPP_CORRUPTED_DATA_ERROR";
+#endif
+  }
+
+  return "<unknown>";
+}
+#endif
+
+template <typename T>
+void check(T result, char const *const func, const char *const file,
+           int const line) {
+}
+
+#ifdef __DPCT_HPP__
+// This will output the proper CUDA error strings in the event
+// that a CUDA host call returns an error
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+
+// This will output the proper error string when calling cudaGetLastError
+#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __getLastCudaError(const char *errorMessage, const char *file,
+                               const int line) {
+  /*
+  DPCT1010:1: SYCL uses exceptions to report errors and does not use the error
+  codes. The call was replaced with 0. You need to rewrite this code.
+  */
+  int err = 0;
+}
+
+// This will only print the proper error string when calling cudaGetLastError
+// but not exit program incase error detected.
+#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __printLastCudaError(const char *errorMessage, const char *file,
+                                 const int line) {
+  /*
+  DPCT1010:3: SYCL uses exceptions to report errors and does not use the error
+  codes. The call was replaced with 0. You need to rewrite this code.
+  */
+  int err = 0;
+}
+#endif
+
+#ifndef MAX
+#define MAX(a, b) (a > b ? a : b)
+#endif
+
+// Float To Int conversion
+inline int ftoi(float value) {
+  return (value >= 0 ? static_cast<int>(value + 0.5)
+                     : static_cast<int>(value - 0.5));
+}
+
+// Beginning of GPU Architecture definitions
+inline int _ConvertSMVer2Cores(int major, int minor) {
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the # of cores per SM
+  typedef struct dpct_type_168589 {
+    int SM;  // 0xMm (hexidecimal notation), M = SM Major version,
+    // and m = SM minor version
+    int Cores;
+  } sSMtoCores;
+
+  sSMtoCores nGpuArchCoresPerSM[] = {
+      {0x30, 192},
+      {0x32, 192},
+      {0x35, 192},
+      {0x37, 192},
+      {0x50, 128},
+      {0x52, 128},
+      {0x53, 128},
+      {0x60,  64},
+      {0x61, 128},
+      {0x62, 128},
+      {0x70,  64},
+      {0x72,  64},
+      {0x75,  64},
+      {0x80,  64},
+      {0x86, 128},
+      {0x87, 128},
+      {0x90, 128},
+      {-1, -1}};
+
+  int index = 0;
+
+  while (nGpuArchCoresPerSM[index].SM != -1) {
+    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
+      return nGpuArchCoresPerSM[index].Cores;
+    }
+
+    index++;
+  }
+
+  // If we don't find the values, we default use the previous one
+  // to run properly
+  printf(
+      "MapSMtoCores for SM %d.%d is undefined."
+      "  Default to use %d Cores/SM\n",
+      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
+  return nGpuArchCoresPerSM[index - 1].Cores;
+}
+
+inline const char* _ConvertSMVer2ArchName(int major, int minor) {
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the GPU Arch name)
+  typedef struct dpct_type_127073 {
+    int SM;  // 0xMm (hexidecimal notation), M = SM Major version,
+    // and m = SM minor version
+    const char* name;
+  } sSMtoArchName;
+
+  sSMtoArchName nGpuArchNameSM[] = {
+      {0x30, "Kepler"},
+      {0x32, "Kepler"},
+      {0x35, "Kepler"},
+      {0x37, "Kepler"},
+      {0x50, "Maxwell"},
+      {0x52, "Maxwell"},
+      {0x53, "Maxwell"},
+      {0x60, "Pascal"},
+      {0x61, "Pascal"},
+      {0x62, "Pascal"},
+      {0x70, "Volta"},
+      {0x72, "Xavier"},
+      {0x75, "Turing"},
+      {0x80, "Ampere"},
+      {0x86, "Ampere"},
+      {0x87, "Ampere"},
+      {0x90, "Hopper"},
+      {-1, "Graphics Device"}};
+
+  int index = 0;
+
+  while (nGpuArchNameSM[index].SM != -1) {
+    if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) {
+      return nGpuArchNameSM[index].name;
+    }
+
+    index++;
+  }
+
+  // If we don't find the values, we default use the previous one
+  // to run properly
+  printf(
+      "MapSMtoArchName for SM %d.%d is undefined."
+      "  Default to use %s\n",
+      major, minor, nGpuArchNameSM[index - 1].name);
+  return nGpuArchNameSM[index - 1].name;
+}
+  // end of GPU Architecture definitions
+
+#ifdef __DPCT_HPP__
+// General GPU Device CUDA Initialization
+inline int gpuDeviceInit(int devID) {
+  int device_count;
+  /*
+  DPCT1003:5: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors((device_count = dpct::dev_mgr::instance().device_count(), 0));
+
+  if (device_count == 0) {
+    fprintf(stderr,
+            "gpuDeviceInit() CUDA error: "
+            "no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (devID < 0) {
+    devID = 0;
+  }
+
+  if (devID > device_count - 1) {
+    fprintf(stderr, "\n");
+    fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
+            device_count);
+    fprintf(stderr,
+            ">> gpuDeviceInit (-device=%d) is not a valid"
+            " GPU device. <<\n",
+            devID);
+    fprintf(stderr, "\n");
+    return -devID;
+  }
+
+  int computeMode = -1, major = 0, minor = 0;
+  /*
+  DPCT1035:6: All SYCL devices can be used by host to submit tasks. You may need
+  to adjust this code.
+  */
+  checkCudaErrors((computeMode = 1, 0));
+  checkCudaErrors(
+      (major = dpct::dev_mgr::instance().get_device(devID).get_major_version(),
+       0));
+  checkCudaErrors(
+      (minor = dpct::dev_mgr::instance().get_device(devID).get_minor_version(),
+       0));
+  /*
+  DPCT1035:7: All SYCL devices can be used by host to submit tasks. You may need
+  to adjust this code.
+  */
+  if (computeMode == 0) {
+    fprintf(stderr,
+            "Error: device is running in <Compute Mode "
+            "Prohibited>, no threads can use cudaSetDevice().\n");
+    return -1;
+  }
+
+  if (major < 1) {
+    fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  /*
+  DPCT1093:8: The "devID" may not be the best XPU device. Adjust the selected
+  device if needed.
+  */
+  /*
+  DPCT1003:9: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors((dpct::select_device(devID), 0));
+  printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, _ConvertSMVer2ArchName(major, minor));
+
+  return devID;
+}
+
+// This function returns the best GPU (with maximum GFLOPS)
+inline int gpuGetMaxGflopsDeviceId() try {
+  int current_device = 0, sm_per_multiproc = 0;
+  int max_perf_device = 0;
+  int device_count = 0;
+  int devices_prohibited = 0;
+
+  uint64_t max_compute_perf = 0;
+  /*
+  DPCT1003:10: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors((device_count = dpct::dev_mgr::instance().device_count(), 0));
+
+  if (device_count == 0) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceId() CUDA error:"
+            " no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Find the best CUDA capable GPU device
+  current_device = 0;
+
+  while (current_device < device_count) {
+    int computeMode = -1, major = 0, minor = 0;
+    /*
+    DPCT1035:11: All SYCL devices can be used by host to submit tasks. You may
+    need to adjust this code.
+    */
+    checkCudaErrors((computeMode = 1, 0));
+    checkCudaErrors((major = dpct::dev_mgr::instance()
+                                 .get_device(current_device)
+                                 .get_major_version(),
+                     0));
+    checkCudaErrors((minor = dpct::dev_mgr::instance()
+                                 .get_device(current_device)
+                                 .get_minor_version(),
+                     0));
+
+    // If this GPU is not running on Compute Mode prohibited,
+    // then we can add it to the list
+    /*
+    DPCT1035:12: All SYCL devices can be used by host to submit tasks. You may
+    need to adjust this code.
+    */
+    if (computeMode != 0) {
+      if (major == 9999 && minor == 9999) {
+        sm_per_multiproc = 1;
+      } else {
+        sm_per_multiproc =
+            _ConvertSMVer2Cores(major,  minor);
+      }
+      int multiProcessorCount = 0, clockRate = 0;
+      checkCudaErrors((multiProcessorCount = dpct::dev_mgr::instance()
+                                                 .get_device(current_device)
+                                                 .get_max_compute_units(),
+                       0));
+      int result = (clockRate = dpct::dev_mgr::instance()
+                                    .get_device(current_device)
+                                    .get_max_clock_frequency(),
+                    0);
+
+      uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate;
+
+      if (compute_perf > max_compute_perf) {
+        max_compute_perf = compute_perf;
+        max_perf_device = current_device;
+      }
+    } else {
+      devices_prohibited++;
+    }
+
+    ++current_device;
+  }
+
+  if (devices_prohibited == device_count) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceId() CUDA error:"
+            " all devices have compute mode prohibited.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return max_perf_device;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+// Initialization code to find the best CUDA Device
+inline int findCudaDevice(int argc, const char **argv) {
+  int devID = 0;
+
+  // If the command-line has a device number specified, use it
+  if (checkCmdLineFlag(argc, argv, "device")) {
+    devID = getCmdLineArgumentInt(argc, argv, "device=");
+
+    if (devID < 0) {
+      printf("Invalid command line parameter\n ");
+      exit(EXIT_FAILURE);
+    } else {
+      devID = gpuDeviceInit(devID);
+
+      if (devID < 0) {
+        printf("exiting...\n");
+        exit(EXIT_FAILURE);
+      }
+    }
+  } else {
+    // Otherwise pick the device with highest Gflops/s
+    devID = gpuGetMaxGflopsDeviceId();
+    /*
+    DPCT1093:13: The "devID" may not be the best XPU device. Adjust the selected
+    device if needed.
+    */
+    /*
+    DPCT1003:14: Migrated API does not return error code. (*, 0) is inserted.
+    You may need to rewrite this code.
+    */
+    checkCudaErrors((dpct::select_device(devID), 0));
+    int major = 0, minor = 0;
+    checkCudaErrors((
+        major = dpct::dev_mgr::instance().get_device(devID).get_major_version(),
+        0));
+    checkCudaErrors((
+        minor = dpct::dev_mgr::instance().get_device(devID).get_minor_version(),
+        0));
+    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+           devID, _ConvertSMVer2ArchName(major, minor), major, minor);
+
+  }
+
+  return devID;
+}
+
+inline int findIntegratedGPU() {
+  int current_device = 0;
+  int device_count = 0;
+  int devices_prohibited = 0;
+
+  /*
+  DPCT1003:15: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors((device_count = dpct::dev_mgr::instance().device_count(), 0));
+
+  if (device_count == 0) {
+    fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Find the integrated GPU which is compute capable
+  while (current_device < device_count) {
+    int computeMode = -1, integrated = -1;
+    /*
+    DPCT1035:16: All SYCL devices can be used by host to submit tasks. You may
+    need to adjust this code.
+    */
+    checkCudaErrors((computeMode = 1, 0));
+    checkCudaErrors((integrated = dpct::dev_mgr::instance()
+                                      .get_device(current_device)
+                                      .get_integrated(),
+                     0));
+    // If GPU is integrated and is not running on Compute Mode prohibited,
+    // then cuda can map to GLES resource
+    /*
+    DPCT1035:17: All SYCL devices can be used by host to submit tasks. You may
+    need to adjust this code.
+    */
+    if (integrated && (computeMode != 0)) {
+      /*
+      DPCT1093:18: The "current_device" may not be the best XPU device. Adjust
+      the selected device if needed.
+      */
+      /*
+      DPCT1003:19: Migrated API does not return error code. (*, 0) is inserted.
+      You may need to rewrite this code.
+      */
+      checkCudaErrors((dpct::select_device(current_device), 0));
+
+      int major = 0, minor = 0;
+      checkCudaErrors((major = dpct::dev_mgr::instance()
+                                   .get_device(current_device)
+                                   .get_major_version(),
+                       0));
+      checkCudaErrors((minor = dpct::dev_mgr::instance()
+                                   .get_device(current_device)
+                                   .get_minor_version(),
+                       0));
+      printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+             current_device, _ConvertSMVer2ArchName(major, minor), major, minor);
+
+      return current_device;
+    } else {
+      devices_prohibited++;
+    }
+
+    current_device++;
+  }
+
+  if (devices_prohibited == device_count) {
+    fprintf(stderr,
+            "CUDA error:"
+            " No GLES-CUDA Interop capable GPU found.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return -1;
+}
+
+// General check for CUDA GPU SM Capabilities
+inline bool checkCudaCapabilities(int major_version, int minor_version) {
+  int dev;
+  int major = 0, minor = 0;
+
+  checkCudaErrors(dev = dpct::dev_mgr::instance().current_device_id());
+  checkCudaErrors(
+      (major = dpct::dev_mgr::instance().get_device(dev).get_major_version(),
+       0));
+  checkCudaErrors(
+      (minor = dpct::dev_mgr::instance().get_device(dev).get_minor_version(),
+       0));
+
+  if ((major > major_version) ||
+      (major == major_version &&
+       minor >= minor_version)) {
+    printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev,
+           _ConvertSMVer2ArchName(major, minor), major, minor);
+    return true;
+  } else {
+    printf(
+        "  No GPU device was found that can support "
+        "CUDA compute capability %d.%d.\n",
+        major_version, minor_version);
+    return false;
+  }
+}
+#endif
+
+  // end of CUDA Helper Functions
+
+#endif  // COMMON_HELPER_CUDA_H_
diff --git a/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/Common/helper_functions.h b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/Common/helper_functions.h
new file mode 100644
index 0000000000..2975ddba6a
--- /dev/null
+++ b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/Common/helper_functions.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (string parsing,
+// timers, image helpers, etc)
+#ifndef COMMON_HELPER_FUNCTIONS_H_
+#define COMMON_HELPER_FUNCTIONS_H_
+
+#ifdef WIN32
+#pragma warning(disable : 4996)
+#endif
+
+// includes, project
+#include <assert.h>
+#include <exception.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// includes, timer, string parsing, image helpers
+#include <helper_image.h>  // helper functions for image compare, dump, data comparisons
+#include <helper_string.h>  // helper functions for string parsing
+#include <helper_timer.h>   // helper functions for timers
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#endif  // COMMON_HELPER_FUNCTIONS_H_
diff --git a/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/Common/helper_image.h b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/Common/helper_image.h
new file mode 100644
index 0000000000..d093d16a0e
--- /dev/null
+++ b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/Common/helper_image.h
@@ -0,0 +1,1001 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (image,bitmap)
+#ifndef COMMON_HELPER_IMAGE_H_
+#define COMMON_HELPER_IMAGE_H_
+
+#include <assert.h>
+#include <exception.h>
+#include <math.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#ifndef MIN
+#define MIN(a, b) ((a < b) ? a : b)
+#endif
+#ifndef MAX
+#define MAX(a, b) ((a > b) ? a : b)
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#include "helper_string.h"
+
+// namespace unnamed (internal)
+namespace helper_image_internal {
+//! size of PGM file header
+const unsigned int PGMHeaderSize = 0x40;
+
+// types
+
+//! Data converter from unsigned char / unsigned byte to type T
+template <class T>
+struct ConverterFromUByte;
+
+//! Data converter from unsigned char / unsigned byte
+template <>
+struct ConverterFromUByte<unsigned char> {
+  //! Conversion operator
+  //! @return converted value
+  //! @param  val  value to convert
+  float operator()(const unsigned char &val) {
+    return static_cast<unsigned char>(val);
+  }
+};
+
+//! Data converter from unsigned char / unsigned byte to float
+template <>
+struct ConverterFromUByte<float> {
+  //! Conversion operator
+  //! @return converted value
+  //! @param  val  value to convert
+  float operator()(const unsigned char &val) {
+    return static_cast<float>(val) / 255.0f;
+  }
+};
+
+//! Data converter from unsigned char / unsigned byte to type T
+template <class T>
+struct ConverterToUByte;
+
+//! Data converter from unsigned char / unsigned byte to unsigned int
+template <>
+struct ConverterToUByte<unsigned char> {
+  //! Conversion operator (essentially a passthru
+  //! @return converted value
+  //! @param  val  value to convert
+  unsigned char operator()(const unsigned char &val) { return val; }
+};
+
+//! Data converter from unsigned char / unsigned byte to unsigned int
+template <>
+struct ConverterToUByte<float> {
+  //! Conversion operator
+  //! @return converted value
+  //! @param  val  value to convert
+  unsigned char operator()(const float &val) {
+    return static_cast<unsigned char>(val * 255.0f);
+  }
+};
+}  // namespace helper_image_internal
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#else
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#endif
+
+inline bool __loadPPM(const char *file, unsigned char **data, unsigned int *w,
+                      unsigned int *h, unsigned int *channels) {
+  FILE *fp = NULL;
+
+  if (FOPEN_FAIL(FOPEN(fp, file, "rb"))) {
+    std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl;
+    return false;
+  }
+
+  // check header
+  char header[helper_image_internal::PGMHeaderSize];
+
+  if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) {
+    std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl;
+    return false;
+  }
+
+  if (strncmp(header, "P5", 2) == 0) {
+    *channels = 1;
+  } else if (strncmp(header, "P6", 2) == 0) {
+    *channels = 3;
+  } else {
+    std::cerr << "__LoadPPM() : File is not a PPM or PGM image" << std::endl;
+    *channels = 0;
+    return false;
+  }
+
+  // parse header, read maxval, width and height
+  unsigned int width = 0;
+  unsigned int height = 0;
+  unsigned int maxval = 0;
+  unsigned int i = 0;
+
+  while (i < 3) {
+    if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) {
+      std::cerr << "__LoadPPM() : reading PGM header returned NULL"
+                << std::endl;
+      return false;
+    }
+
+    if (header[0] == '#') {
+      continue;
+    }
+
+    if (i == 0) {
+      i += SSCANF(header, "%u %u %u", &width, &height, &maxval);
+    } else if (i == 1) {
+      i += SSCANF(header, "%u %u", &height, &maxval);
+    } else if (i == 2) {
+      i += SSCANF(header, "%u", &maxval);
+    }
+  }
+
+  // check if given handle for the data is initialized
+  if (NULL != *data) {
+    if (*w != width || *h != height) {
+      std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl;
+    }
+  } else {
+    *data = (unsigned char *)malloc(sizeof(unsigned char) * width * height *
+                                    *channels);
+    *w = width;
+    *h = height;
+  }
+
+  // read and close file
+  if (fread(*data, sizeof(unsigned char), width * height * *channels, fp) ==
+      0) {
+    std::cerr << "__LoadPPM() read data returned error." << std::endl;
+  }
+
+  fclose(fp);
+
+  return true;
+}
+
+template <class T>
+inline bool sdkLoadPGM(const char *file, T **data, unsigned int *w,
+                       unsigned int *h) {
+  unsigned char *idata = NULL;
+  unsigned int channels;
+
+  if (true != __loadPPM(file, &idata, w, h, &channels)) {
+    return false;
+  }
+
+  unsigned int size = *w * *h * channels;
+
+  // initialize mem if necessary
+  // the correct size is checked / set in loadPGMc()
+  if (NULL == *data) {
+    *data = reinterpret_cast<T *>(malloc(sizeof(T) * size));
+  }
+
+  // copy and cast data
+  std::transform(idata, idata + size, *data,
+                 helper_image_internal::ConverterFromUByte<T>());
+
+  free(idata);
+
+  return true;
+}
+
+template <class T>
+inline bool sdkLoadPPM4(const char *file, T **data, unsigned int *w,
+                        unsigned int *h) {
+  unsigned char *idata = 0;
+  unsigned int channels;
+
+  if (__loadPPM(file, &idata, w, h, &channels)) {
+    // pad 4th component
+    int size = *w * *h;
+    // keep the original pointer
+    unsigned char *idata_orig = idata;
+    *data = reinterpret_cast<T *>(malloc(sizeof(T) * size * 4));
+    unsigned char *ptr = *data;
+
+    for (int i = 0; i < size; i++) {
+      *ptr++ = *idata++;
+      *ptr++ = *idata++;
+      *ptr++ = *idata++;
+      *ptr++ = 0;
+    }
+
+    free(idata_orig);
+    return true;
+  } else {
+    free(idata);
+    return false;
+  }
+}
+
+inline bool __savePPM(const char *file, unsigned char *data, unsigned int w,
+                      unsigned int h, unsigned int channels) {
+  assert(NULL != data);
+  assert(w > 0);
+  assert(h > 0);
+
+  std::fstream fh(file, std::fstream::out | std::fstream::binary);
+
+  if (fh.bad()) {
+    std::cerr << "__savePPM() : Opening file failed." << std::endl;
+    return false;
+  }
+
+  if (channels == 1) {
+    fh << "P5\n";
+  } else if (channels == 3) {
+    fh << "P6\n";
+  } else {
+    std::cerr << "__savePPM() : Invalid number of channels." << std::endl;
+    return false;
+  }
+
+  fh << w << "\n" << h << "\n" << 0xff << std::endl;
+
+  for (unsigned int i = 0; (i < (w * h * channels)) && fh.good(); ++i) {
+    fh << data[i];
+  }
+
+  fh.flush();
+
+  if (fh.bad()) {
+    std::cerr << "__savePPM() : Writing data failed." << std::endl;
+    return false;
+  }
+
+  fh.close();
+
+  return true;
+}
+
+template <class T>
+inline bool sdkSavePGM(const char *file, T *data, unsigned int w,
+                       unsigned int h) {
+  unsigned int size = w * h;
+  unsigned char *idata = (unsigned char *)malloc(sizeof(unsigned char) * size);
+
+  std::transform(data, data + size, idata,
+                 helper_image_internal::ConverterToUByte<T>());
+
+  // write file
+  bool result = __savePPM(file, idata, w, h, 1);
+
+  // cleanup
+  free(idata);
+
+  return result;
+}
+
+inline bool sdkSavePPM4ub(const char *file, unsigned char *data, unsigned int w,
+                          unsigned int h) {
+  // strip 4th component
+  int size = w * h;
+  unsigned char *ndata =
+      (unsigned char *)malloc(sizeof(unsigned char) * size * 3);
+  unsigned char *ptr = ndata;
+
+  for (int i = 0; i < size; i++) {
+    *ptr++ = *data++;
+    *ptr++ = *data++;
+    *ptr++ = *data++;
+    data++;
+  }
+
+  bool result = __savePPM(file, ndata, w, h, 3);
+  free(ndata);
+  return result;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Read file \filename and return the data
+//! @return bool if reading the file succeeded, otherwise false
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//////////////////////////////////////////////////////////////////////////////
+template <class T>
+inline bool sdkReadFile(const char *filename, T **data, unsigned int *len,
+                        bool verbose) {
+  // check input arguments
+  assert(NULL != filename);
+  assert(NULL != len);
+
+  // intermediate storage for the data read
+  std::vector<T> data_read;
+
+  // open file for reading
+  FILE *fh = NULL;
+
+  // check if filestream is valid
+  if (FOPEN_FAIL(FOPEN(fh, filename, "r"))) {
+    printf("Unable to open input file: %s\n", filename);
+    return false;
+  }
+
+  // read all data elements
+  T token;
+
+  while (!feof(fh)) {
+    fscanf(fh, "%f", &token);
+    data_read.push_back(token);
+  }
+
+  // the last element is read twice
+  data_read.pop_back();
+  fclose(fh);
+
+  // check if the given handle is already initialized
+  if (NULL != *data) {
+    if (*len != data_read.size()) {
+      std::cerr << "sdkReadFile() : Initialized memory given but "
+                << "size  mismatch with signal read "
+                << "(data read / data init = " << (unsigned int)data_read.size()
+                << " / " << *len << ")" << std::endl;
+
+      return false;
+    }
+  } else {
+    // allocate storage for the data read
+    *data = reinterpret_cast<T *>(malloc(sizeof(T) * data_read.size()));
+    // store signal size
+    *len = static_cast<unsigned int>(data_read.size());
+  }
+
+  // copy data
+  memcpy(*data, &data_read.front(), sizeof(T) * data_read.size());
+
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Read file \filename and return the data
+//! @return bool if reading the file succeeded, otherwise false
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//////////////////////////////////////////////////////////////////////////////
+template <class T>
+inline bool sdkReadFileBlocks(const char *filename, T **data, unsigned int *len,
+                              unsigned int block_num, unsigned int block_size,
+                              bool verbose) {
+  // check input arguments
+  assert(NULL != filename);
+  assert(NULL != len);
+
+  // open file for reading
+  FILE *fh = fopen(filename, "rb");
+
+  if (fh == NULL && verbose) {
+    std::cerr << "sdkReadFile() : Opening file failed." << std::endl;
+    return false;
+  }
+
+  // check if the given handle is already initialized
+  // allocate storage for the data read
+  data[block_num] = reinterpret_cast<T *>(malloc(block_size));
+
+  // read all data elements
+  fseek(fh, block_num * block_size, SEEK_SET);
+  *len = fread(data[block_num], sizeof(T), block_size / sizeof(T), fh);
+
+  fclose(fh);
+
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename
+//! @return true if writing the file succeeded, otherwise false
+//! @param filename name of the source file
+//! @param data  data to write
+//! @param len  number of data elements in data, -1 on error
+//! @param epsilon  epsilon for comparison
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool sdkWriteFile(const char *filename, const T *data, unsigned int len,
+                         const S epsilon, bool verbose, bool append = false) {
+  assert(NULL != filename);
+  assert(NULL != data);
+
+  // open file for writing
+  //    if (append) {
+  std::fstream fh(filename, std::fstream::out | std::fstream::ate);
+
+  if (verbose) {
+    std::cerr << "sdkWriteFile() : Open file " << filename
+              << " for write/append." << std::endl;
+  }
+
+  /*    } else {
+          std::fstream fh(filename, std::fstream::out);
+          if (verbose) {
+              std::cerr << "sdkWriteFile() : Open file " << filename << " for
+     write." << std::endl;
+          }
+      }
+  */
+
+  // check if filestream is valid
+  if (!fh.good()) {
+    if (verbose) {
+      std::cerr << "sdkWriteFile() : Opening file failed." << std::endl;
+    }
+
+    return false;
+  }
+
+  // first write epsilon
+  fh << "# " << epsilon << "\n";
+
+  // write data
+  for (unsigned int i = 0; (i < len) && (fh.good()); ++i) {
+    fh << data[i] << ' ';
+  }
+
+  // Check if writing succeeded
+  if (!fh.good()) {
+    if (verbose) {
+      std::cerr << "sdkWriteFile() : Writing file failed." << std::endl;
+    }
+
+    return false;
+  }
+
+  // file ends with nl
+  fh << std::endl;
+
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Compare two arrays of arbitrary type
+//! @return  true if \a reference and \a data are identical, otherwise false
+//! @param reference  timer_interface to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool compareData(const T *reference, const T *data,
+                        const unsigned int len, const S epsilon,
+                        const float threshold) {
+  assert(epsilon >= 0);
+
+  bool result = true;
+  unsigned int error_count = 0;
+
+  for (unsigned int i = 0; i < len; ++i) {
+    float diff = static_cast<float>(reference[i]) - static_cast<float>(data[i]);
+    bool comp = (diff <= epsilon) && (diff >= -epsilon);
+    result &= comp;
+
+    error_count += !comp;
+
+#if 0
+
+    if (!comp) {
+      std::cerr << "ERROR, i = " << i << ",\t "
+                << reference[i] << " / "
+                << data[i]
+                << " (reference / data)\n";
+    }
+
+#endif
+  }
+
+  if (threshold == 0.0f) {
+    return (result) ? true : false;
+  } else {
+    if (error_count) {
+      printf("%4.2f(%%) of bytes mismatched (count=%d)\n",
+             static_cast<float>(error_count) * 100 / static_cast<float>(len),
+             error_count);
+    }
+
+    return (len * threshold > error_count) ? true : false;
+  }
+}
+
+#ifndef __MIN_EPSILON_ERROR
+#define __MIN_EPSILON_ERROR 1e-3f
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//! Compare two arrays of arbitrary type
+//! @return  true if \a reference and \a data are identical, otherwise false
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//! @param epsilon    threshold % of (# of bytes) for pass/fail
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool compareDataAsFloatThreshold(const T *reference, const T *data,
+                                        const unsigned int len, const S epsilon,
+                                        const float threshold) {
+  assert(epsilon >= 0);
+
+  // If we set epsilon to be 0, let's set a minimum threshold
+  float max_error = MAX((float)epsilon, __MIN_EPSILON_ERROR);
+  int error_count = 0;
+  bool result = true;
+
+  for (unsigned int i = 0; i < len; ++i) {
+    float diff =
+        fabs(static_cast<float>(reference[i]) - static_cast<float>(data[i]));
+    bool comp = (diff < max_error);
+    result &= comp;
+
+    if (!comp) {
+      error_count++;
+    }
+  }
+
+  if (threshold == 0.0f) {
+    if (error_count) {
+      printf("total # of errors = %d\n", error_count);
+    }
+
+    return (error_count == 0) ? true : false;
+  } else {
+    if (error_count) {
+      printf("%4.2f(%%) of bytes mismatched (count=%d)\n",
+             static_cast<float>(error_count) * 100 / static_cast<float>(len),
+             error_count);
+    }
+
+    return ((len * threshold > error_count) ? true : false);
+  }
+}
+
+inline void sdkDumpBin(void *data, unsigned int bytes, const char *filename) {
+  printf("sdkDumpBin: <%s>\n", filename);
+  FILE *fp;
+  FOPEN(fp, filename, "wb");
+  fwrite(data, bytes, 1, fp);
+  fflush(fp);
+  fclose(fp);
+}
+
+inline bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file,
+                                  unsigned int nelements, const float epsilon,
+                                  const float threshold, char *exec_path) {
+  unsigned int *src_buffer, *ref_buffer;
+  FILE *src_fp = NULL, *ref_fp = NULL;
+
+  uint64_t error_count = 0;
+  size_t fsize = 0;
+
+  if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) {
+    printf("compareBin2Bin <unsigned int> unable to open src_file: %s\n",
+           src_file);
+    error_count++;
+  }
+
+  char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
+
+  if (ref_file_path == NULL) {
+    printf("compareBin2Bin <unsigned int>  unable to find <%s> in <%s>\n",
+           ref_file, exec_path);
+    printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
+           ref_file);
+    printf("Aborting comparison!\n");
+    printf("  FAILED\n");
+    error_count++;
+
+    if (src_fp) {
+      fclose(src_fp);
+    }
+
+    if (ref_fp) {
+      fclose(ref_fp);
+    }
+  } else {
+    if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) {
+      printf(
+          "compareBin2Bin <unsigned int>"
+          " unable to open ref_file: %s\n",
+          ref_file_path);
+      error_count++;
+    }
+
+    if (src_fp && ref_fp) {
+      src_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int));
+      ref_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int));
+
+      fsize = fread(src_buffer, nelements, sizeof(unsigned int), src_fp);
+      fsize = fread(ref_buffer, nelements, sizeof(unsigned int), ref_fp);
+
+      printf(
+          "> compareBin2Bin <unsigned int> nelements=%d,"
+          " epsilon=%4.2f, threshold=%4.2f\n",
+          nelements, epsilon, threshold);
+      printf("   src_file <%s>, size=%d bytes\n", src_file,
+             static_cast<int>(fsize));
+      printf("   ref_file <%s>, size=%d bytes\n", ref_file_path,
+             static_cast<int>(fsize));
+
+      if (!compareData<unsigned int, float>(ref_buffer, src_buffer, nelements,
+                                            epsilon, threshold)) {
+        error_count++;
+      }
+
+      fclose(src_fp);
+      fclose(ref_fp);
+
+      free(src_buffer);
+      free(ref_buffer);
+    } else {
+      if (src_fp) {
+        fclose(src_fp);
+      }
+
+      if (ref_fp) {
+        fclose(ref_fp);
+      }
+    }
+  }
+
+  if (error_count == 0) {
+    printf("  OK\n");
+  } else {
+    printf("  FAILURE: %d errors...\n", (unsigned int)error_count);
+  }
+
+  return (error_count == 0);  // returns true if all pixels pass
+}
+
+inline bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file,
+                                   unsigned int nelements, const float epsilon,
+                                   const float threshold, char *exec_path) {
+  float *src_buffer = NULL, *ref_buffer = NULL;
+  FILE *src_fp = NULL, *ref_fp = NULL;
+  size_t fsize = 0;
+
+  uint64_t error_count = 0;
+
+  if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) {
+    printf("compareBin2Bin <float> unable to open src_file: %s\n", src_file);
+    error_count = 1;
+  }
+
+  char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
+
+  if (ref_file_path == NULL) {
+    printf("compareBin2Bin <float> unable to find <%s> in <%s>\n", ref_file,
+           exec_path);
+    printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
+           exec_path);
+    printf("Aborting comparison!\n");
+    printf("  FAILED\n");
+    error_count++;
+
+    if (src_fp) {
+      fclose(src_fp);
+    }
+
+    if (ref_fp) {
+      fclose(ref_fp);
+    }
+  } else {
+    if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) {
+      printf("compareBin2Bin <float> unable to open ref_file: %s\n",
+             ref_file_path);
+      error_count = 1;
+    }
+
+    if (src_fp && ref_fp) {
+      src_buffer = reinterpret_cast<float *>(malloc(nelements * sizeof(float)));
+      ref_buffer = reinterpret_cast<float *>(malloc(nelements * sizeof(float)));
+
+      printf(
+          "> compareBin2Bin <float> nelements=%d, epsilon=%4.2f,"
+          " threshold=%4.2f\n",
+          nelements, epsilon, threshold);
+      fsize = fread(src_buffer, sizeof(float), nelements, src_fp);
+      printf("   src_file <%s>, size=%d bytes\n", src_file,
+             static_cast<int>(fsize * sizeof(float)));
+      fsize = fread(ref_buffer, sizeof(float), nelements, ref_fp);
+      printf("   ref_file <%s>, size=%d bytes\n", ref_file_path,
+             static_cast<int>(fsize * sizeof(float)));
+
+      if (!compareDataAsFloatThreshold<float, float>(
+              ref_buffer, src_buffer, nelements, epsilon, threshold)) {
+        error_count++;
+      }
+
+      fclose(src_fp);
+      fclose(ref_fp);
+
+      free(src_buffer);
+      free(ref_buffer);
+    } else {
+      if (src_fp) {
+        fclose(src_fp);
+      }
+
+      if (ref_fp) {
+        fclose(ref_fp);
+      }
+    }
+  }
+
+  if (error_count == 0) {
+    printf("  OK\n");
+  } else {
+    printf("  FAILURE: %d errors...\n", (unsigned int)error_count);
+  }
+
+  return (error_count == 0);  // returns true if all pixels pass
+}
+
+inline bool sdkCompareL2fe(const float *reference, const float *data,
+                           const unsigned int len, const float epsilon) {
+  assert(epsilon >= 0);
+
+  float error = 0;
+  float ref = 0;
+
+  for (unsigned int i = 0; i < len; ++i) {
+    float diff = reference[i] - data[i];
+    error += diff * diff;
+    ref += reference[i] * reference[i];
+  }
+
+  float normRef = sqrtf(ref);
+
+  if (fabs(ref) < 1e-7) {
+#ifdef _DEBUG
+    std::cerr << "ERROR, reference l2-norm is 0\n";
+#endif
+    return false;
+  }
+
+  float normError = sqrtf(error);
+  error = normError / normRef;
+  bool result = error < epsilon;
+#ifdef _DEBUG
+
+  if (!result) {
+    std::cerr << "ERROR, l2-norm error " << error << " is greater than epsilon "
+              << epsilon << "\n";
+  }
+
+#endif
+
+  return result;
+}
+
+inline bool sdkLoadPPMub(const char *file, unsigned char **data,
+                         unsigned int *w, unsigned int *h) {
+  unsigned int channels;
+  return __loadPPM(file, data, w, h, &channels);
+}
+
+inline bool sdkLoadPPM4ub(const char *file, unsigned char **data,
+                          unsigned int *w, unsigned int *h) {
+  unsigned char *idata = 0;
+  unsigned int channels;
+
+  if (__loadPPM(file, &idata, w, h, &channels)) {
+    // pad 4th component
+    int size = *w * *h;
+    // keep the original pointer
+    unsigned char *idata_orig = idata;
+    *data = (unsigned char *)malloc(sizeof(unsigned char) * size * 4);
+    unsigned char *ptr = *data;
+
+    for (int i = 0; i < size; i++) {
+      *ptr++ = *idata++;
+      *ptr++ = *idata++;
+      *ptr++ = *idata++;
+      *ptr++ = 0;
+    }
+
+    free(idata_orig);
+    return true;
+  } else {
+    free(idata);
+    return false;
+  }
+}
+
+inline bool sdkComparePPM(const char *src_file, const char *ref_file,
+                          const float epsilon, const float threshold,
+                          bool verboseErrors) {
+  unsigned char *src_data, *ref_data;
+  uint64_t error_count = 0;
+  unsigned int ref_width, ref_height;
+  unsigned int src_width, src_height;
+
+  if (src_file == NULL || ref_file == NULL) {
+    if (verboseErrors) {
+      std::cerr << "PPMvsPPM: src_file or ref_file is NULL."
+                   "  Aborting comparison\n";
+    }
+
+    return false;
+  }
+
+  if (verboseErrors) {
+    std::cerr << "> Compare (a)rendered:  <" << src_file << ">\n";
+    std::cerr << ">         (b)reference: <" << ref_file << ">\n";
+  }
+
+  if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true) {
+    if (verboseErrors) {
+      std::cerr << "PPMvsPPM: unable to load ref image file: " << ref_file
+                << "\n";
+    }
+
+    return false;
+  }
+
+  if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true) {
+    std::cerr << "PPMvsPPM: unable to load src image file: " << src_file
+              << "\n";
+    return false;
+  }
+
+  if (src_height != ref_height || src_width != ref_width) {
+    if (verboseErrors) {
+      std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width
+                << "," << src_height << ")vs(" << ref_width << "," << ref_height
+                << ")\n";
+    }
+  }
+
+  if (verboseErrors) {
+    std::cerr << "PPMvsPPM: comparing images size (" << src_width << ","
+              << src_height << ") epsilon(" << epsilon << "), threshold("
+              << threshold * 100 << "%)\n";
+  }
+
+  if (compareData(ref_data, src_data, src_width * src_height * 4, epsilon,
+                  threshold) == false) {
+    error_count = 1;
+  }
+
+  if (error_count == 0) {
+    if (verboseErrors) {
+      std::cerr << "    OK\n\n";
+    }
+  } else {
+    if (verboseErrors) {
+      std::cerr << "    FAILURE!  " << error_count << " errors...\n\n";
+    }
+  }
+
+  // returns true if all pixels pass
+  return (error_count == 0) ? true : false;
+}
+
+inline bool sdkComparePGM(const char *src_file, const char *ref_file,
+                          const float epsilon, const float threshold,
+                          bool verboseErrors) {
+  unsigned char *src_data = 0, *ref_data = 0;
+  uint64_t error_count = 0;
+  unsigned int ref_width, ref_height;
+  unsigned int src_width, src_height;
+
+  if (src_file == NULL || ref_file == NULL) {
+    if (verboseErrors) {
+      std::cerr << "PGMvsPGM: src_file or ref_file is NULL."
+                   "  Aborting comparison\n";
+    }
+
+    return false;
+  }
+
+  if (verboseErrors) {
+    std::cerr << "> Compare (a)rendered:  <" << src_file << ">\n";
+    std::cerr << ">         (b)reference: <" << ref_file << ">\n";
+  }
+
+  if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true) {
+    if (verboseErrors) {
+      std::cerr << "PGMvsPGM: unable to load ref image file: " << ref_file
+                << "\n";
+    }
+
+    return false;
+  }
+
+  if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true) {
+    std::cerr << "PGMvsPGM: unable to load src image file: " << src_file
+              << "\n";
+    return false;
+  }
+
+  if (src_height != ref_height || src_width != ref_width) {
+    if (verboseErrors) {
+      std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width
+                << "," << src_height << ")vs(" << ref_width << "," << ref_height
+                << ")\n";
+    }
+  }
+
+  if (verboseErrors)
+    std::cerr << "PGMvsPGM: comparing images size (" << src_width << ","
+              << src_height << ") epsilon(" << epsilon << "), threshold("
+              << threshold * 100 << "%)\n";
+
+  if (compareData(ref_data, src_data, src_width * src_height, epsilon,
+                  threshold) == false) {
+    error_count = 1;
+  }
+
+  if (error_count == 0) {
+    if (verboseErrors) {
+      std::cerr << "    OK\n\n";
+    }
+  } else {
+    if (verboseErrors) {
+      std::cerr << "    FAILURE!  " << error_count << " errors...\n\n";
+    }
+  }
+
+  // returns true if all pixels pass
+  return (error_count == 0) ? true : false;
+}
+
+#endif  // COMMON_HELPER_IMAGE_H_
\ No newline at end of file
diff --git a/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/Common/helper_string.h b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/Common/helper_string.h
new file mode 100644
index 0000000000..47fb1ac1fa
--- /dev/null
+++ b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/Common/helper_string.h
@@ -0,0 +1,428 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (string parsing, timers, etc)
+#ifndef COMMON_HELPER_STRING_H_
+#define COMMON_HELPER_STRING_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fstream>
+#include <string>
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef _CRT_SECURE_NO_DEPRECATE
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+#ifndef STRCASECMP
+#define STRCASECMP _stricmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP _strnicmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf_s
+#endif
+#else  // Linux Includes
+#include <string.h>
+#include <strings.h>
+
+#ifndef STRCASECMP
+#define STRCASECMP strcasecmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP strncasecmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf
+#endif
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// CUDA Utility Helper Functions
+inline int stringRemoveDelimiter(char delimiter, const char *string) {
+  int string_start = 0;
+
+  while (string[string_start] == delimiter) {
+    string_start++;
+  }
+
+  if (string_start >= static_cast<int>(strlen(string) - 1)) {
+    return 0;
+  }
+
+  return string_start;
+}
+
+inline int getFileExtension(char *filename, char **extension) {
+  int string_length = static_cast<int>(strlen(filename));
+
+  while (filename[string_length--] != '.') {
+    if (string_length == 0) break;
+  }
+
+  if (string_length > 0) string_length += 2;
+
+  if (string_length == 0)
+    *extension = NULL;
+  else
+    *extension = &filename[string_length];
+
+  return string_length;
+}
+
+inline bool checkCmdLineFlag(const int argc, const char **argv,
+                             const char *string_ref) {
+  bool bFound = false;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char *string_argv = &argv[i][string_start];
+
+      const char *equal_pos = strchr(string_argv, '=');
+      int argv_length = static_cast<int>(
+          equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
+
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (length == argv_length &&
+          !STRNCASECMP(string_argv, string_ref, length)) {
+        bFound = true;
+        continue;
+      }
+    }
+  }
+
+  return bFound;
+}
+
+// This function wraps the CUDA Driver API into a template function
+template <class T>
+inline bool getCmdLineArgumentValue(const int argc, const char **argv,
+                                    const char *string_ref, T *value) {
+  bool bFound = false;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char *string_argv = &argv[i][string_start];
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
+          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+          *value = (T)atoi(&string_argv[length + auto_inc]);
+        }
+
+        bFound = true;
+        i = argc;
+      }
+    }
+  }
+
+  return bFound;
+}
+
+inline int getCmdLineArgumentInt(const int argc, const char **argv,
+                                 const char *string_ref) {
+  bool bFound = false;
+  int value = -1;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char *string_argv = &argv[i][string_start];
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
+          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+          value = atoi(&string_argv[length + auto_inc]);
+        } else {
+          value = 0;
+        }
+
+        bFound = true;
+        continue;
+      }
+    }
+  }
+
+  if (bFound) {
+    return value;
+  } else {
+    return 0;
+  }
+}
+
+inline float getCmdLineArgumentFloat(const int argc, const char **argv,
+                                     const char *string_ref) {
+  bool bFound = false;
+  float value = -1;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char *string_argv = &argv[i][string_start];
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
+          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+          value = static_cast<float>(atof(&string_argv[length + auto_inc]));
+        } else {
+          value = 0.f;
+        }
+
+        bFound = true;
+        continue;
+      }
+    }
+  }
+
+  if (bFound) {
+    return value;
+  } else {
+    return 0;
+  }
+}
+
+inline bool getCmdLineArgumentString(const int argc, const char **argv,
+                                     const char *string_ref,
+                                     char **string_retval) {
+  bool bFound = false;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      char *string_argv = const_cast<char *>(&argv[i][string_start]);
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        *string_retval = &string_argv[length + 1];
+        bFound = true;
+        continue;
+      }
+    }
+  }
+
+  if (!bFound) {
+    *string_retval = NULL;
+  }
+
+  return bFound;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Find the path for a file assuming that
+//! files are found in the searchPath.
+//!
+//! @return the path if succeeded, otherwise 0
+//! @param filename         name of the file
+//! @param executable_path  optional absolute path of the executable
+//////////////////////////////////////////////////////////////////////////////
+inline char *sdkFindFilePath(const char *filename,
+                             const char *executable_path) {
+  // <executable_name> defines a variable that is replaced with the name of the
+  // executable
+
+  // Typical relative search paths to locate needed companion files (e.g. sample
+  // input data, or JIT source files) The origin for the relative search may be
+  // the .exe file, a .bat file launching an .exe, a browser .exe launching the
+  // .exe or .bat, etc
+  const char *searchPath[] = {
+      "./",                                           // same dir
+      "./data/",                                      // same dir
+
+      "../../../../Samples/<executable_name>/",       // up 4 in tree
+      "../../../Samples/<executable_name>/",          // up 3 in tree
+      "../../Samples/<executable_name>/",             // up 2 in tree
+
+      "../../../../Samples/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/0_Introduction/<executable_name>/",  // up 4 in tree
+      "../../../Samples/0_Introduction/<executable_name>/",     // up 3 in tree
+      "../../Samples/0_Introduction/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/1_Utilities/<executable_name>/",  // up 4 in tree
+      "../../../Samples/1_Utilities/<executable_name>/",     // up 3 in tree
+      "../../Samples/1_Utilities/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/2_Concepts_and_Techniques/<executable_name>/",  // up 4 in tree
+      "../../../Samples/2_Concepts_and_Techniques/<executable_name>/",     // up 3 in tree
+      "../../Samples/2_Concepts_and_Techniques/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/3_CUDA_Features/<executable_name>/",  // up 4 in tree
+      "../../../Samples/3_CUDA_Features/<executable_name>/",     // up 3 in tree
+      "../../Samples/3_CUDA_Features/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/4_CUDA_Libraries/<executable_name>/",  // up 4 in tree
+      "../../../Samples/4_CUDA_Libraries/<executable_name>/",     // up 3 in tree
+      "../../Samples/4_CUDA_Libraries/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/5_Domain_Specific/<executable_name>/",  // up 4 in tree
+      "../../../Samples/5_Domain_Specific/<executable_name>/",     // up 3 in tree
+      "../../Samples/5_Domain_Specific/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/6_Performance/<executable_name>/",  // up 4 in tree
+      "../../../Samples/6_Performance/<executable_name>/",     // up 3 in tree
+      "../../Samples/6_Performance/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/0_Introduction/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/0_Introduction/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/0_Introduction/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/1_Utilities/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/1_Utilities/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/1_Utilities/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/3_CUDA_Features/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/3_CUDA_Features/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/3_CUDA_Features/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/4_CUDA_Libraries/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/4_CUDA_Libraries/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/4_CUDA_Libraries/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/5_Domain_Specific/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/5_Domain_Specific/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/5_Domain_Specific/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/6_Performance/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/6_Performance/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/6_Performance/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Common/data/",                     // up 4 in tree
+      "../../../Common/data/",                        // up 3 in tree
+      "../../Common/data/"                            // up 2 in tree
+  };
+
+  // Extract the executable name
+  std::string executable_name;
+
+  if (executable_path != 0) {
+    executable_name = std::string(executable_path);
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    // Windows path delimiter
+    size_t delimiter_pos = executable_name.find_last_of('\\');
+    executable_name.erase(0, delimiter_pos + 1);
+
+    if (executable_name.rfind(".exe") != std::string::npos) {
+      // we strip .exe, only if the .exe is found
+      executable_name.resize(executable_name.size() - 4);
+    }
+
+#else
+    // Linux & OSX path delimiter
+    size_t delimiter_pos = executable_name.find_last_of('/');
+    executable_name.erase(0, delimiter_pos + 1);
+#endif
+  }
+
+  // Loop over all search paths and return the first hit
+  for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
+    std::string path(searchPath[i]);
+    size_t executable_name_pos = path.find("<executable_name>");
+
+    // If there is executable_name variable in the searchPath
+    // replace it with the value
+    if (executable_name_pos != std::string::npos) {
+      if (executable_path != 0) {
+        path.replace(executable_name_pos, strlen("<executable_name>"),
+                     executable_name);
+      } else {
+        // Skip this path entry if no executable argument is given
+        continue;
+      }
+    }
+
+#ifdef _DEBUG
+    printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
+#endif
+
+    // Test if the file exists
+    path.append(filename);
+    FILE *fp;
+    FOPEN(fp, path.c_str(), "rb");
+
+    if (fp != NULL) {
+      fclose(fp);
+      // File found
+      // returning an allocated array here for backwards compatibility reasons
+      char *file_path = reinterpret_cast<char *>(malloc(path.length() + 1));
+      STRCPY(file_path, path.length() + 1, path.c_str());
+      return file_path;
+    }
+
+    if (fp) {
+      fclose(fp);
+    }
+  }
+
+  // File not found
+  printf("\nerror: sdkFindFilePath: file <%s> not found!\n", filename);
+  return 0;
+}
+
+#endif  // COMMON_HELPER_STRING_H_
\ No newline at end of file
diff --git a/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/Common/helper_timer.h b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/Common/helper_timer.h
new file mode 100644
index 0000000000..8ebce43598
--- /dev/null
+++ b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/Common/helper_timer.h
@@ -0,0 +1,465 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// Helper Timing Functions
+#ifndef COMMON_HELPER_TIMER_H_
+#define COMMON_HELPER_TIMER_H_
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// includes, system
+#include <vector>
+
+// includes, project
+#include <exception.h>
+
+// Definition of the StopWatch Interface, this is used if we don't want to use
+// the CUT functions But rather in a self contained class interface
+class StopWatchInterface {
+ public:
+  StopWatchInterface() {}
+  virtual ~StopWatchInterface() {}
+
+ public:
+  //! Start time measurement
+  virtual void start() = 0;
+
+  //! Stop time measurement
+  virtual void stop() = 0;
+
+  //! Reset time counters to zero
+  virtual void reset() = 0;
+
+  //! Time in msec. after start. If the stop watch is still running (i.e. there
+  //! was no call to stop()) then the elapsed time is returned, otherwise the
+  //! time between the last start() and stop call is returned
+  virtual float getTime() = 0;
+
+  //! Mean time to date based on the number of times the stopwatch has been
+  //! _stopped_ (ie finished sessions) and the current total time
+  virtual float getAverageTime() = 0;
+};
+
+//////////////////////////////////////////////////////////////////
+// Begin Stopwatch timer class definitions for all OS platforms //
+//////////////////////////////////////////////////////////////////
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+// includes, system
+#define WINDOWS_LEAN_AND_MEAN
+#include <windows.h>
+#undef min
+#undef max
+
+//! Windows specific implementation of StopWatch
+class StopWatchWin : public StopWatchInterface {
+ public:
+  //! Constructor, default
+  StopWatchWin()
+      : start_time(),
+        end_time(),
+        diff_time(0.0f),
+        total_time(0.0f),
+        running(false),
+        clock_sessions(0),
+        freq(0),
+        freq_set(false) {
+    if (!freq_set) {
+      // helper variable
+      LARGE_INTEGER temp;
+
+      // get the tick frequency from the OS
+      QueryPerformanceFrequency(reinterpret_cast<LARGE_INTEGER *>(&temp));
+
+      // convert to type in which it is needed
+      freq = (static_cast<double>(temp.QuadPart)) / 1000.0;
+
+      // rememeber query
+      freq_set = true;
+    }
+  }
+
+  // Destructor
+  ~StopWatchWin() {}
+
+ public:
+  //! Start time measurement
+  inline void start();
+
+  //! Stop time measurement
+  inline void stop();
+
+  //! Reset time counters to zero
+  inline void reset();
+
+  //! Time in msec. after start. If the stop watch is still running (i.e. there
+  //! was no call to stop()) then the elapsed time is returned, otherwise the
+  //! time between the last start() and stop call is returned
+  inline float getTime();
+
+  //! Mean time to date based on the number of times the stopwatch has been
+  //! _stopped_ (ie finished sessions) and the current total time
+  inline float getAverageTime();
+
+ private:
+  // member variables
+
+  //! Start of measurement
+  LARGE_INTEGER start_time;
+  //! End of measurement
+  LARGE_INTEGER end_time;
+
+  //! Time difference between the last start and stop
+  float diff_time;
+
+  //! TOTAL time difference between starts and stops
+  float total_time;
+
+  //! flag if the stop watch is running
+  bool running;
+
+  //! Number of times clock has been started
+  //! and stopped to allow averaging
+  int clock_sessions;
+
+  //! tick frequency
+  double freq;
+
+  //! flag if the frequency has been set
+  bool freq_set;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::start() {
+  QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&start_time));
+  running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement and increment add to the current diff_time summation
+//! variable. Also increment the number of times this clock has been run.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::stop() {
+  QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&end_time));
+  diff_time = static_cast<float>(((static_cast<double>(end_time.QuadPart) -
+                                   static_cast<double>(start_time.QuadPart)) /
+                                  freq));
+
+  total_time += diff_time;
+  clock_sessions++;
+  running = false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. Does not change the timer running state but does
+//! recapture this point in time as the current start time if it is running.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::reset() {
+  diff_time = 0;
+  total_time = 0;
+  clock_sessions = 0;
+
+  if (running) {
+    QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&start_time));
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. after start. If the stop watch is still running (i.e. there
+//! was no call to stop()) then the elapsed time is returned added to the
+//! current diff_time sum, otherwise the current summed time difference alone
+//! is returned.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchWin::getTime() {
+  // Return the TOTAL time to date
+  float retval = total_time;
+
+  if (running) {
+    LARGE_INTEGER temp;
+    QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&temp));
+    retval += static_cast<float>(((static_cast<double>(temp.QuadPart) -
+                                   static_cast<double>(start_time.QuadPart)) /
+                                  freq));
+  }
+
+  return retval;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run based on the total number of COMPLETED runs
+//! and the total time.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchWin::getAverageTime() {
+  return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f;
+}
+#else
+// Declarations for Stopwatch on Linux and Mac OSX
+// includes, system
+#include <sys/time.h>
+#include <ctime>
+
+//! Windows specific implementation of StopWatch
+class StopWatchLinux : public StopWatchInterface {
+ public:
+  //! Constructor, default
+  StopWatchLinux()
+      : start_time(),
+        diff_time(0.0),
+        total_time(0.0),
+        running(false),
+        clock_sessions(0) {}
+
+  // Destructor
+  virtual ~StopWatchLinux() {}
+
+ public:
+  //! Start time measurement
+  inline void start();
+
+  //! Stop time measurement
+  inline void stop();
+
+  //! Reset time counters to zero
+  inline void reset();
+
+  //! Time in msec. after start. If the stop watch is still running (i.e. there
+  //! was no call to stop()) then the elapsed time is returned, otherwise the
+  //! time between the last start() and stop call is returned
+  inline float getTime();
+
+  //! Mean time to date based on the number of times the stopwatch has been
+  //! _stopped_ (ie finished sessions) and the current total time
+  inline float getAverageTime();
+
+ private:
+  // helper functions
+
+  //! Get difference between start time and current time
+  inline float getDiffTime();
+
+ private:
+  // member variables
+
+  //! Start of measurement
+  struct timeval start_time;
+
+  //! Time difference between the last start and stop
+  float diff_time;
+
+  //! TOTAL time difference between starts and stops
+  float total_time;
+
+  //! flag if the stop watch is running
+  bool running;
+
+  //! Number of times clock has been started
+  //! and stopped to allow averaging
+  int clock_sessions;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::start() {
+  gettimeofday(&start_time, 0);
+  running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement and increment add to the current diff_time summation
+//! variable. Also increment the number of times this clock has been run.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::stop() {
+  diff_time = getDiffTime();
+  total_time += diff_time;
+  running = false;
+  clock_sessions++;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. Does not change the timer running state but does
+//! recapture this point in time as the current start time if it is running.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::reset() {
+  diff_time = 0;
+  total_time = 0;
+  clock_sessions = 0;
+
+  if (running) {
+    gettimeofday(&start_time, 0);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. after start. If the stop watch is still running (i.e. there
+//! was no call to stop()) then the elapsed time is returned added to the
+//! current diff_time sum, otherwise the current summed time difference alone
+//! is returned.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getTime() {
+  // Return the TOTAL time to date
+  float retval = total_time;
+
+  if (running) {
+    retval += getDiffTime();
+  }
+
+  return retval;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run based on the total number of COMPLETED runs
+//! and the total time.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getAverageTime() {
+  return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f;
+}
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getDiffTime() {
+  struct timeval t_time;
+  gettimeofday(&t_time, 0);
+
+  // time difference in milli-seconds
+  return static_cast<float>(1000.0 * (t_time.tv_sec - start_time.tv_sec) +
+                            (0.001 * (t_time.tv_usec - start_time.tv_usec)));
+}
+#endif  // WIN32
+
+////////////////////////////////////////////////////////////////////////////////
+//! Timer functionality exported
+
+////////////////////////////////////////////////////////////////////////////////
+//! Create a new timer
+//! @return true if a time has been created, otherwise false
+//! @param  name of the new timer, 0 if the creation failed
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkCreateTimer(StopWatchInterface **timer_interface) {
+// printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface);
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+  *timer_interface = reinterpret_cast<StopWatchInterface *>(new StopWatchWin());
+#else
+  *timer_interface =
+      reinterpret_cast<StopWatchInterface *>(new StopWatchLinux());
+#endif
+  return (*timer_interface != NULL) ? true : false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Delete a timer
+//! @return true if a time has been deleted, otherwise false
+//! @param  name of the timer to delete
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkDeleteTimer(StopWatchInterface **timer_interface) {
+  // printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface) {
+    delete *timer_interface;
+    *timer_interface = NULL;
+  }
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start the time with name \a name
+//! @param name  name of the timer to start
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkStartTimer(StopWatchInterface **timer_interface) {
+  // printf("sdkStartTimer called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface) {
+    (*timer_interface)->start();
+  }
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop the time with name \a name. Does not reset.
+//! @param name  name of the timer to stop
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkStopTimer(StopWatchInterface **timer_interface) {
+  // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface) {
+    (*timer_interface)->stop();
+  }
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Resets the timer's counter.
+//! @param name  name of the timer to reset.
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkResetTimer(StopWatchInterface **timer_interface) {
+  // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface) {
+    (*timer_interface)->reset();
+  }
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Return the average time for timer execution as the total time
+//! for the timer dividied by the number of completed (stopped) runs the timer
+//! has made.
+//! Excludes the current running time if the timer is currently running.
+//! @param name  name of the timer to return the time of
+////////////////////////////////////////////////////////////////////////////////
+inline float sdkGetAverageTimerValue(StopWatchInterface **timer_interface) {
+  //  printf("sdkGetAverageTimerValue called object %08x\n", (void
+  //  *)*timer_interface);
+  if (*timer_interface) {
+    return (*timer_interface)->getAverageTime();
+  } else {
+    return 0.0f;
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Total execution time for the timer over all runs since the last reset
+//! or timer creation.
+//! @param name  name of the timer to obtain the value of.
+////////////////////////////////////////////////////////////////////////////////
+inline float sdkGetTimerValue(StopWatchInterface **timer_interface) {
+  // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface) {
+    return (*timer_interface)->getTime();
+  } else {
+    return 0.0f;
+  }
+}
+
+#endif  // COMMON_HELPER_TIMER_H_
\ No newline at end of file
diff --git a/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/src/matrixMulCUBLAS.cpp b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/src/matrixMulCUBLAS.cpp
new file mode 100644
index 0000000000..50ecbd820c
--- /dev/null
+++ b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/02_sycl_dpct_migrated/src/matrixMulCUBLAS.cpp
@@ -0,0 +1,376 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+* Matrix multiplication: C = A * B.
+* Host code.
+*
+* This sample implements matrix multiplication as described in Chapter 3
+* of the programming guide and uses the CUBLAS library to demonstrate
+* the best performance.
+* SOME PRECAUTIONS:
+* IF WE WANT TO CALCULATE ROW-MAJOR MATRIX MULTIPLY C = A * B,
+* WE JUST NEED CALL CUBLAS API IN A REVERSE ORDER: cublasSegemm(B, A)!
+* The reason is explained as follows:
+* CUBLAS library uses column-major storage, but C/C++ use row-major storage.
+* When passing the matrix pointer to CUBLAS, the memory layout alters from
+* row-major to column-major, which is equivalent to an implicit transpose.
+* In the case of row-major C/C++ matrix A, B, and a simple matrix multiplication
+* C = A * B, we can't use the input order like cublasSgemm(A, B)  because of
+* implicit transpose. The actual result of cublasSegemm(A, B) is A(T) * B(T).
+* If col(A(T)) != row(B(T)), equal to row(A) != col(B), A(T) and B(T) are not
+* multipliable. Moreover, even if A(T) and B(T) are multipliable, the result C
+* is a column-based cublas matrix, which means C(T) in C/C++, we need extra
+* transpose code to convert it to a row-based C/C++ matrix.
+* To solve the problem, let's consider our desired result C, a row-major matrix.
+* In cublas format, it is C(T) actually (because of the implicit transpose).
+* C = A * B, so C(T) = (A * B) (T) = B(T) * A(T). Cublas matrice B(T) and A(T)
+* happen to be C/C++ matrice B and A (still because of the implicit transpose)!
+* We don't need extra transpose code, we only need alter the input order!
+*
+* CUBLAS provides high-performance matrix multiplication.
+* See also:
+* V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
+* in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
+* Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
+*/
+
+// Utilities and system includes
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include <assert.h>
+#include <helper_string.h>
+#include <oneapi/mkl.hpp>
+#include <dpct/blas_utils.hpp>
+// helper for shared functions common to CUDA Samples
+
+// CUDA runtime
+
+// CUDA and CUBLAS functions
+#include <helper_functions.h>
+#include <helper_cuda.h>
+#include <cmath>
+
+#include <chrono>
+
+#ifndef min
+#define min(a, b) ((a < b) ? a : b)
+#endif
+#ifndef max
+#define max(a, b) ((a > b) ? a : b)
+#endif
+
+dpct::device_ext &dev_ct1 = dpct::get_current_device();
+sycl::queue &q_ct1 = dev_ct1.default_queue();
+
+// Optional Command-line multiplier for matrix sizes
+typedef struct _matrixSize {
+  unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
+} sMatrixSize;
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compute reference data set matrix multiply on CPU
+//! C = A * B
+//! @param C          reference data, computed but preallocated
+//! @param A          matrix A as provided to device
+//! @param B          matrix B as provided to device
+//! @param hA         height of matrix A
+//! @param wB         width of matrix B
+////////////////////////////////////////////////////////////////////////////////
+void matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA,
+                  unsigned int wA, unsigned int wB) {
+  for (unsigned int i = 0; i < hA; ++i)
+    for (unsigned int j = 0; j < wB; ++j) {
+      double sum = 0;
+
+      for (unsigned int k = 0; k < wA; ++k) {
+        double a = A[i * wA + k];
+        double b = B[k * wB + j];
+        sum += a * b;
+      }
+
+      C[i * wB + j] = (float)sum;
+    }
+}
+
+// Allocates a matrix with random float entries.
+void randomInit(float *data, int size) {
+  for (int i = 0; i < size; ++i) data[i] = rand() / (float)RAND_MAX;
+}
+
+void printDiff(float *data1, float *data2, int width, int height,
+               int iListLength, float fListTol) {
+  printf("Listing first %d Differences > %.6f...\n", iListLength, fListTol);
+  int i, j, k;
+  int error_count = 0;
+
+  for (j = 0; j < height; j++) {
+    if (error_count < iListLength) {
+      printf("\n  Row %d:\n", j);
+    }
+
+    for (i = 0; i < width; i++) {
+      k = j * width + i;
+      float fDiff = fabs(data1[k] - data2[k]);
+
+      if (fDiff > fListTol) {
+        if (error_count < iListLength) {
+          printf("    Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j,
+                 data1[k], data2[k], fDiff);
+        }
+
+        error_count++;
+      }
+    }
+  }
+
+  printf(" \n  Total Errors = %d\n", error_count);
+}
+
+void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple,
+                    sMatrixSize &matrix_size) try {
+  // By default, we use device 0, otherwise we override the device ID based on
+  // what is provided at the command line
+  int error;
+  devID = 0;
+
+  // devID = findCudaDevice(argc, (const char **)argv);
+
+  if (checkCmdLineFlag(argc, (const char **)argv, "sizemult")) {
+    iSizeMultiple =
+        getCmdLineArgumentInt(argc, (const char **)argv, "sizemult");
+  }
+
+  iSizeMultiple = min(iSizeMultiple, 10);
+  iSizeMultiple = max(iSizeMultiple, 1);
+
+  dpct::device_info deviceProp;
+
+  dpct::dev_mgr::instance().get_device(devID).get_device_info(deviceProp);
+
+  printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID,
+         deviceProp.get_name(), deviceProp.get_major_version(),
+         deviceProp.get_minor_version());
+
+  int block_size = 32;
+
+  matrix_size.uiWA = 3 * block_size * iSizeMultiple;
+  matrix_size.uiHA = 4 * block_size * iSizeMultiple;
+  matrix_size.uiWB = 2 * block_size * iSizeMultiple;
+  matrix_size.uiHB = 3 * block_size * iSizeMultiple;
+  matrix_size.uiWC = 2 * block_size * iSizeMultiple;
+  matrix_size.uiHC = 4 * block_size * iSizeMultiple;
+
+  printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n", matrix_size.uiHA,
+         matrix_size.uiWA, matrix_size.uiHB, matrix_size.uiWB, matrix_size.uiHC,
+         matrix_size.uiWC);
+
+  if (matrix_size.uiWA != matrix_size.uiHB ||
+      matrix_size.uiHA != matrix_size.uiHC ||
+      matrix_size.uiWB != matrix_size.uiWC) {
+    printf("ERROR: Matrix sizes do not match!\n");
+    exit(-1);
+  }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Run a simple test matrix multiply using CUBLAS
+////////////////////////////////////////////////////////////////////////////////
+int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size) {
+  dpct::device_info deviceProp;
+
+  dpct::dev_mgr::instance().get_device(devID).get_device_info(deviceProp);
+
+  int block_size = 32;
+
+  // set seed for rand()
+  srand(2006);
+
+  // allocate host memory for matrices A and B
+  unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
+  unsigned int mem_size_A = sizeof(float) * size_A;
+  float *h_A = (float *)malloc(mem_size_A);
+  unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
+  unsigned int mem_size_B = sizeof(float) * size_B;
+  float *h_B = (float *)malloc(mem_size_B);
+
+  // set seed for rand()
+  srand(2006);
+
+  // initialize host memory
+  randomInit(h_A, size_A);
+  randomInit(h_B, size_B);
+
+  // allocate device memory
+  float *d_A, *d_B, *d_C;
+  unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
+  unsigned int mem_size_C = sizeof(float) * size_C;
+
+  // allocate host memory for the result
+  float *h_C = (float *)malloc(mem_size_C);
+  float *h_CUBLAS = (float *)malloc(mem_size_C);
+
+  d_A = (float *)sycl::malloc_device(mem_size_A, dpct::get_default_queue());
+  d_B = (float *)sycl::malloc_device(mem_size_B, dpct::get_default_queue());
+
+  dpct::get_default_queue().memcpy(d_A, h_A, mem_size_A).wait();
+  dpct::get_default_queue().memcpy(d_B, h_B, mem_size_B).wait();
+  
+  d_C = (float *)sycl::malloc_device(mem_size_C, dpct::get_default_queue());
+
+  // setup execution parameters
+  sycl::range<3> threads(1, block_size, block_size);
+  sycl::range<3> grid(1, matrix_size.uiHC / threads[1],
+                      matrix_size.uiWC / threads[2]);
+
+  // create and start timer
+  printf("Computing result using CUBLAS...");
+
+  // execute the kernel
+  int nIter = 30;
+
+  // CUBLAS version 2.0
+  {
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+    sycl::queue *handle;
+    dpct::event_ptr start, stop;
+    std::chrono::time_point<std::chrono::steady_clock> start_ct1;
+    std::chrono::time_point<std::chrono::steady_clock> stop_ct1;
+
+    handle = &dpct::get_default_queue();
+
+    // Perform warmup operation with cublas
+    oneapi::mkl::blas::column_major::gemm(
+             *handle, oneapi::mkl::transpose::nontrans,
+             oneapi::mkl::transpose::nontrans, matrix_size.uiWB,
+             matrix_size.uiHA, matrix_size.uiWA, alpha, d_B, matrix_size.uiWB,
+             d_A, matrix_size.uiWA, beta, d_C, matrix_size.uiWB);
+
+    // Allocate CUDA events that we'll use for timing
+    start = new sycl::event();
+    stop = new sycl::event();
+
+    // Record the start event
+    start_ct1 = std::chrono::steady_clock::now();
+
+    for (int j = 0; j < nIter; j++) {
+      // note cublas is column primary!
+      // need to transpose the order
+      oneapi::mkl::blas::column_major::gemm(
+               *handle, oneapi::mkl::transpose::nontrans,
+               oneapi::mkl::transpose::nontrans, matrix_size.uiWB,
+               matrix_size.uiHA, matrix_size.uiWA, alpha, d_B, matrix_size.uiWB,
+               d_A, matrix_size.uiWA, beta, d_C, matrix_size.uiWB);
+    }
+
+    printf("done.\n");
+
+    // Record the stop event
+    stop_ct1 = std::chrono::steady_clock::now();
+
+    // Wait for the stop event to complete
+    float msecTotal = 0.0f;
+    msecTotal = std::chrono::duration<float, std::milli>(
+                                     stop_ct1 - start_ct1)
+                                     .count();
+
+    // Compute and print the performance
+    float msecPerMatrixMul = msecTotal / nIter;
+    double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiHC *
+                               (double)matrix_size.uiWC *
+                               (double)matrix_size.uiHB;
+    double gigaFlops =
+        (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
+    printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
+           gigaFlops, msecPerMatrixMul, flopsPerMatrixMul);
+
+    // copy result from device to host
+    dpct::get_default_queue().memcpy(h_CUBLAS, d_C, mem_size_C).wait();
+
+    // Destroy the handle
+    handle = nullptr;
+  }
+
+  // compute reference solution
+  printf("Computing result using host CPU...");
+  float *reference = (float *)malloc(mem_size_C);
+  matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA,
+               matrix_size.uiWB);
+  printf("done.\n");
+
+  // check result (CUBLAS)
+  bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f);
+
+  if (resCUBLAS != true) {
+    printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100,
+              1.0e-5f);
+  }
+
+  printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n",
+         (true == resCUBLAS) ? "PASS" : "FAIL");
+
+  printf(
+      "\nNOTE: The CUDA Samples are not meant for performance measurements. "
+      "Results may vary when GPU Boost is enabled.\n");
+
+  // clean up memory
+  free(h_A);
+  free(h_B);
+  free(h_C);
+  free(reference);
+  sycl::free(d_A, dpct::get_default_queue());
+  sycl::free(d_B, dpct::get_default_queue());
+  sycl::free(d_C, dpct::get_default_queue());
+
+  if (resCUBLAS == true) {
+    return EXIT_SUCCESS;  // return value = 1
+  } else {
+    return EXIT_FAILURE;  // return value = 0
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+int main(int argc, char **argv) {
+  printf("[Matrix Multiply CUBLAS] - Starting...\n");
+
+  int devID = 0, sizeMult = 5;
+  sMatrixSize matrix_size;
+
+  initializeCUDA(argc, argv, devID, sizeMult, matrix_size);
+
+  int matrix_result = matrixMultiply(argc, argv, devID, matrix_size);
+
+  return matrix_result;
+}
\ No newline at end of file
diff --git a/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/CMakeLists.txt b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/CMakeLists.txt
new file mode 100644
index 0000000000..02c234ea83
--- /dev/null
+++ b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/CMakeLists.txt
@@ -0,0 +1,16 @@
+cmake_minimum_required (VERSION 3.4.0)
+
+set(CMAKE_CXX_COMPILER "icpx")
+
+project (guided_mamtrix_mul_cuBLAS_SYCL_Migration)
+# Set default build type to RelWithDebInfo if not specified
+if (NOT CMAKE_BUILD_TYPE)
+	message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info")
+	set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE
+	     STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel"
+	     FORCE)
+endif ()
+
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+
+add_subdirectory (02_sycl_dpct_migrated)
diff --git a/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/License.txt b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/License.txt
new file mode 100644
index 0000000000..9cde07f558
--- /dev/null
+++ b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/License.txt
@@ -0,0 +1,8 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
diff --git a/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/README.md b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/README.md
new file mode 100644
index 0000000000..f60ebdc8b4
--- /dev/null
+++ b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/README.md
@@ -0,0 +1,157 @@
+# `Matrix Multiplication cuBLAS Migrated` Sample
+
+The `Matrix Multiplication cuBLAS Migrated` sample is a sample illustrating GPU performance for matrix multiplication. This sample also shows how to use the migrated code in a high-performance performance way for matrix multiplication.
+
+| Area                   | Description
+|:---                    |:---
+| What you will learn    | How to begin migrating CUDA code to a SYCL*-compliant equivalent
+| Time to complete       | 90 minutes
+| Category               | Code Optimization
+
+For more information on oneMKL and complete documentation of all oneMKL routines, see https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl-documentation.html.
+
+## Purpose
+
+The sample source code using SYCL was migrated from CUDA source code for offloading computations to a GPU/CPU. The sample demonstrates how to migrate code to SYCL, optimize the migration steps, and improve processing time.
+
+The sample source files show the usage of oneMKL and performance for GPU/CPU.
+
+>**Note**: This sample is based on the [*CUDA Samples - CUDA Libraries*](https://github.com/NVIDIA/cuda-samples/tree/master/Samples/4_CUDA_Libraries/matrixMulCUBLAS) sample in the NVIDIA/cuda-samples GitHub repository.
+
+## Prerequisites
+
+| Optimized for         | Description
+|:---                   |:---
+| OS                    | Ubuntu* 20.04
+| Hardware              | 10th Gen Intel® processors or newer
+| Software              | Intel® oneAPI DPC++/C++ Compiler
+
+## Key Implementation Details
+
+This sample contains two sets of sources in the following folders:
+
+| Folder Name             | Description
+|:---                     |:---
+| `01_sycl_dpct_output`   | Contains output of Intel® DPC++ Compatibility Tool used to migrate SYCL-compliant code from CUDA code. <br> This SYCL code has some unmigrated or incorrectly generated code that has to be manually fixed before it is functional. (The code does not work as supplied.)
+| `02_sycl_dpct_migrated` | Contains SYCL to CUDA migrated code generated by using the Intel® DPC++ Compatibility Tool with the manual changes implemented to make the code fully functional.
+
+## Set Environment Variables
+
+When working with the command-line interface (CLI), you should configure the oneAPI toolkits using environment variables. Set up your CLI environment by sourcing the `setvars` script every time you open a new terminal window. This practice ensures that your compiler, libraries, and tools are ready for development.
+
+## Build the `Matrix Multiplication cuBLAS Migrated` Sample
+
+> **Note**: If you have not already done so, set up your CLI
+> environment by sourcing  the `setvars` script in the root of your oneAPI installation.
+>
+> Linux*:
+> - For system wide installations: `. /opt/intel/oneapi/setvars.sh`
+> - For private installations: ` . ~/intel/oneapi/setvars.sh`
+> - For non-POSIX shells, like csh, use the following command: `bash -c 'source <install-dir>/setvars.sh ; exec csh'`
+>
+> For more information on configuring environment variables, see *[Use the setvars Script with Linux* or macOS*](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/oneapi-development-environment-setup/use-the-setvars-script-with-linux-or-macos.html)*.
+
+### On Linux*
+
+1. Change to the sample directory.
+2. Build the samples.
+   ```
+   $ mkdir build
+   $ cd build
+   $ cmake ..
+   $ make
+   ```
+
+   By default, this command sequence builds the version of the source code in the  `02_sycl_dpct_migrated` folder.
+
+#### Troubleshooting
+
+If an error occurs, you can get more details by running `make` with
+the `VERBOSE=1` argument:
+```
+make VERBOSE=1
+```
+If you receive an error message, troubleshoot the problem using the **Diagnostics Utility for Intel® oneAPI Toolkits**. The diagnostic utility provides configuration and system checks to help find missing dependencies, permissions errors, and other issues. See the [Diagnostics Utility for Intel® oneAPI Toolkits User Guide](https://www.intel.com/content/www/us/en/develop/documentation/diagnostic-utility-user-guide/top.html) for more information on using the utility.
+
+
+## Run the `Matrix Multiplication cuBLAS Migrated` Sample
+
+### On Linux
+
+Run the programs on a CPU or GPU. Each sample uses a default device, which in most cases is a GPU.
+
+1. Run the samples in the `02_sycl_dpct_migrated` folder.
+   ```
+   make run_matrix_mul
+   ```
+
+### Build and Run the `Matrix Multiplication cuBLAS Migrated` Sample in Intel® DevCloud (Optional)
+
+When running a sample in the Intel® DevCloud, you must specify the compute node (CPU, GPU, FPGA) and whether to run in batch or interactive mode. For more information, see the Intel® oneAPI Base Toolkit [Get Started Guide](https://devcloud.intel.com/oneapi/get_started/).
+
+#### Build and Run Samples in Batch Mode (Optional)
+
+You can submit build and run jobs through a Portable Bash Script (PBS). A job is a script that submitted to PBS through the `qsub` utility. By default, the `qsub` utility does not inherit the current environment variables or your current working directory, so you might need to submit jobs to configure the environment variables. To indicate the correct working directory, you can use either absolute paths or pass the `-d \<dir\>` option to `qsub`.
+
+1. Open a terminal on a Linux* system.
+2. Log in to Intel® DevCloud.
+   ```
+   ssh devcloud
+   ```
+3. Download the samples.
+   ```
+   git clone https://github.com/oneapi-src/oneAPI-samples.git
+   ```
+4. Change to the sample directory.
+5. Configure the sample for a GPU node and choose the backend as OpenCL.
+   ```
+   qsub  -I  -l nodes=1:gpu:ppn=2 -d .
+   export SYCL_DEVICE_FILTER=opencl:gpu
+   ```
+   - `-I` (upper case I) requests an interactive session.
+   - `-l nodes=1:gpu:ppn=2` (lower case L) assigns one full GPU node.
+   - `-d .` makes the current folder as the working directory for the task.
+
+     |Available Nodes  |Command Options
+     |:---             |:---
+     | GPU	           |`qsub -l nodes=1:gpu:ppn=2 -d .`
+     | CPU	           |`qsub -l nodes=1:xeon:ppn=2 -d .`
+
+6. Perform build steps as you would on Linux.
+7. Run the programs.
+8. Clean up the project files.
+   ```
+   make clean
+   ```
+9. Disconnect from the Intel® DevCloud.
+   ```
+   exit
+   ```
+
+## Example Output
+
+This is example output if you built the default and ran `run_matrix_mul`.
+
+```
+[ 50%] Building CXX object 02_sycl_dpct_migrated/CMakeFiles/02_sycl_dpct_migrated.dir/src/matrixMulCUBLAS.cpp.o
+[100%] Linking CXX executable ../bin/02_sycl_dpct_migrated
+[100%] Built target 02_sycl_dpct_migrated
+[Matrix Multiply CUBLAS] - Starting...
+GPU Device 0: "Intel(R) Core(TM) i7-10610U CPU @ 1.80GHz" with compute capability 3.0
+
+MatrixA(640,480), MatrixB(480,320), MatrixC(640,320)
+Computing result using CUBLAS...done.
+Performance= 3676.94 GFlop/s, Time= 0.053 msec, Size= 196608000 Ops
+Computing result using host CPU...done.
+Comparing CUBLAS Matrix Multiply with CPU results: PASS
+
+NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
+Built target run_matrix_mul
+```
+
+## License
+
+Code samples are licensed under the MIT license. See
+[License.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/License.txt) for details.
+
+Third party program licenses are at [third-party-programs.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/third-party-programs.txt).
diff --git a/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/sample.json b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/sample.json
new file mode 100644
index 0000000000..b3b78ddfe4
--- /dev/null
+++ b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/sample.json
@@ -0,0 +1,25 @@
+{
+  "guid": "244EEBAA-F3A0-46FD-9305-26DEDAA48C90",	
+  "name": "Matrix Multiplication cuBLAS Migrated",
+  "categories": ["Toolkit/oneAPI Direct Programming/C++SYCL"],
+  "description": "This sample shows the migration of a more complex matrix multiplication sample from cuBLAS to SYCL.",
+  "toolchain": [ "dpcpp" ],
+  "dependencies": [ "mkl" ],
+  "languages": [ { "cpp": {} } ],
+  "targetDevice": [ "CPU", "GPU" ],
+  "os": [ "linux" ],
+  "builder": [ "cmake" ],
+  "ciTests": {
+	"linux": [{
+		"steps": [
+			"mkdir build",
+      "cd build",
+      "cmake ..",
+      "make",
+			"make run_matrix_mul"
+		 ]
+	}]
+
+  }
+  "expertise": "Code Optimization"
+}
\ No newline at end of file
diff --git a/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/third-party-programs.txt b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/third-party-programs.txt
new file mode 100644
index 0000000000..1fac18875f
--- /dev/null
+++ b/DirectProgramming/C++SYCL/DenseLinearAlgebra/guided_matrix_mul_SYCLMigration/third-party-programs.txt
@@ -0,0 +1,518 @@
+oneAPI Code Samples - Third Party Programs File
+
+This file contains the list of third party software ("third party programs")
+contained in the Intel software and their required notices and/or license
+terms. This third party software, even if included with the distribution of the
+Intel software, may be governed by separate license terms, including without
+limitation, third party license terms, other Intel software license terms, and
+open source software license terms. These separate license terms govern your use
+of the third party programs as set forth in the “third-party-programs.txt” or
+other similarly named text file.
+ 
+Third party programs and their corresponding required notices and/or license
+terms are listed below.
+
+--------------------------------------------------------------------------------
+1. n-digit-mnist
+
+Apache License 2.0
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+--------------------------------------------------------------------------------
+2. GNU-EFI
+   Copyright (c) 1998-2000 Intel Corporation
+
+The files in the "lib" and "inc" subdirectories are using the EFI Application 
+Toolkit distributed by Intel at http://developer.intel.com/technology/efi
+
+This code is covered by the following agreement:
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE. THE EFI SPECIFICATION AND ALL OTHER INFORMATION
+ON THIS WEB SITE ARE PROVIDED "AS IS" WITH NO WARRANTIES, AND ARE SUBJECT
+TO CHANGE WITHOUT NOTICE.
+
+--------------------------------------------------------------------------------
+3. Edk2
+   Copyright (c) 2019, Intel Corporation.  All rights reserved.
+
+   Edk2 Basetools
+   Copyright (c) 2019, Intel Corporation.  All rights reserved.
+
+SPDX-License-Identifier: BSD-2-Clause-Patent
+
+--------------------------------------------------------------------------------
+4. Cuda-Samples
+   Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+ * Neither the name of NVIDIA CORPORATION nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+5. Rodinia
+   Copyright (c)2008-2011 University of Virginia
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted without royalty fees or other restrictions, provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+    * Neither the name of the University of Virginia, the Dept. of Computer Science, nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF VIRGINIA OR THE SOFTWARE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+If you use this software or a modified version of it, please cite the most relevant among the following papers:
+
+ - M. A. Goodrum, M. J. Trotter, A. Aksel, S. T. Acton, and K. Skadron. Parallelization of Particle Filter Algorithms. In Proceedings of the 3rd Workshop on Emerging Applications and Many-core Architecture (EAMA), in conjunction with the IEEE/ACM International 
+Symposium on Computer Architecture (ISCA), June 2010.
+
+ - S. Che, M. Boyer, J. Meng, D. Tarjan, J. W. Sheaffer, Sang-Ha Lee and K. Skadron.
+Rodinia: A Benchmark Suite for Heterogeneous Computing. IEEE International Symposium
+on Workload Characterization, Oct 2009.
+
+- J. Meng and K. Skadron. "Performance Modeling and Automatic Ghost Zone Optimization
+for Iterative Stencil Loops on GPUs." In Proceedings of the 23rd Annual ACM International
+Conference on Supercomputing (ICS), June 2009.
+
+- L.G. Szafaryn, K. Skadron and J. Saucerman. "Experiences Accelerating MATLAB Systems
+Biology Applications." in Workshop on Biomedicine in Computing (BiC) at the International
+Symposium on Computer Architecture (ISCA), June 2009.
+
+- M. Boyer, D. Tarjan, S. T. Acton, and K. Skadron. "Accelerating Leukocyte Tracking using CUDA:
+A Case Study in Leveraging Manycore Coprocessors." In Proceedings of the International Parallel
+and Distributed Processing Symposium (IPDPS), May 2009.
+
+- S. Che, M. Boyer, J. Meng, D. Tarjan, J. W. Sheaffer, and K. Skadron. "A Performance
+Study of General Purpose Applications on Graphics Processors using CUDA" Journal of
+Parallel and Distributed Computing, Elsevier, June 2008.
+--------------------------------------------------------------------------------
+6. Intel® Implicit SPMD Program Compiler (Intel® ISPC) - Renderkit samples
+   Copyright Intel Corporation
+   All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+--------------------------------------------------------------------------------
+7. Heat Transmission
+
+GNU LESSER GENERAL PUBLIC LICENSE
+Version 3, 29 June 2007
+
+Copyright © 2007 Free Software Foundation, Inc. <https://fsf.org/>
+
+Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
+
+This version of the GNU Lesser General Public License incorporates the terms and conditions of version 3 of the GNU General Public License, supplemented by the additional permissions listed below.
+
+0. Additional Definitions.
+As used herein, “this License” refers to version 3 of the GNU Lesser General Public License, and the “GNU GPL” refers to version 3 of the GNU General Public License.
+
+“The Library” refers to a covered work governed by this License, other than an Application or a Combined Work as defined below.
+
+An “Application” is any work that makes use of an interface provided by the Library, but which is not otherwise based on the Library. Defining a subclass of a class defined by the Library is deemed a mode of using an interface provided by the Library.
+
+A “Combined Work” is a work produced by combining or linking an Application with the Library. The particular version of the Library with which the Combined Work was made is also called the “Linked Version”.
+
+The “Minimal Corresponding Source” for a Combined Work means the Corresponding Source for the Combined Work, excluding any source code for portions of the Combined Work that, considered in isolation, are based on the Application, and not on the Linked Version.
+
+The “Corresponding Application Code” for a Combined Work means the object code and/or source code for the Application, including any data and utility programs needed for reproducing the Combined Work from the Application, but excluding the System Libraries of the Combined Work.
+
+1. Exception to Section 3 of the GNU GPL.
+You may convey a covered work under sections 3 and 4 of this License without being bound by section 3 of the GNU GPL.
+
+2. Conveying Modified Versions.
+If you modify a copy of the Library, and, in your modifications, a facility refers to a function or data to be supplied by an Application that uses the facility (other than as an argument passed when the facility is invoked), then you may convey a copy of the modified version:
+
+a) under this License, provided that you make a good faith effort to ensure that, in the event an Application does not supply the function or data, the facility still operates, and performs whatever part of its purpose remains meaningful, or
+b) under the GNU GPL, with none of the additional permissions of this License applicable to that copy.
+3. Object Code Incorporating Material from Library Header Files.
+The object code form of an Application may incorporate material from a header file that is part of the Library. You may convey such object code under terms of your choice, provided that, if the incorporated material is not limited to numerical parameters, data structure layouts and accessors, or small macros, inline functions and templates (ten or fewer lines in length), you do both of the following:
+
+a) Give prominent notice with each copy of the object code that the Library is used in it and that the Library and its use are covered by this License.
+b) Accompany the object code with a copy of the GNU GPL and this license document.
+4. Combined Works.
+You may convey a Combined Work under terms of your choice that, taken together, effectively do not restrict modification of the portions of the Library contained in the Combined Work and reverse engineering for debugging such modifications, if you also do each of the following:
+
+a) Give prominent notice with each copy of the Combined Work that the Library is used in it and that the Library and its use are covered by this License.
+b) Accompany the Combined Work with a copy of the GNU GPL and this license document.
+c) For a Combined Work that displays copyright notices during execution, include the copyright notice for the Library among these notices, as well as a reference directing the user to the copies of the GNU GPL and this license document.
+d) Do one of the following:
+0) Convey the Minimal Corresponding Source under the terms of this License, and the Corresponding Application Code in a form suitable for, and under terms that permit, the user to recombine or relink the Application with a modified version of the Linked Version to produce a modified Combined Work, in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source.
+1) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (a) uses at run time a copy of the Library already present on the user's computer system, and (b) will operate properly with a modified version of the Library that is interface-compatible with the Linked Version.
+e) Provide Installation Information, but only if you would otherwise be required to provide such information under section 6 of the GNU GPL, and only to the extent that such information is necessary to install and execute a modified version of the Combined Work produced by recombining or relinking the Application with a modified version of the Linked Version. (If you use option 4d0, the Installation Information must accompany the Minimal Corresponding Source and Corresponding Application Code. If you use option 4d1, you must provide the Installation Information in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source.)
+5. Combined Libraries.
+You may place library facilities that are a work based on the Library side by side in a single library together with other library facilities that are not Applications and are not covered by this License, and convey such a combined library under terms of your choice, if you do both of the following:
+
+a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities, conveyed under the terms of this License.
+b) Give prominent notice with the combined library that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work.
+6. Revised Versions of the GNU Lesser General Public License.
+The Free Software Foundation may publish revised and/or new versions of the GNU Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Library as you received it specifies that a certain numbered version of the GNU Lesser General Public License “or any later version” applies to it, you have the option of following the terms and conditions either of that published version or of any later version published by the Free Software Foundation. If the Library as you received it does not specify a version number of the GNU Lesser General Public License, you may choose any version of the GNU Lesser General Public License ever published by the Free Software Foundation.
+
+If the Library as you received it specifies that a proxy can decide whether future versions of the GNU Lesser General Public License shall apply, that proxy's public statement of acceptance of any version is permanent authorization for you to choose that version for the Library.
+
+--------------------------------------------------------------------------------
+8. chart.js
+   Copyright (c) 2014-2021 Chart.js Contributors
+
+   color
+   Copyright (c) 2018-2021 Jukka Kurkela
+
+   Microsoft DirectX 11 Toolkit Engine Template: d3d11game_win32
+   copyright 2015-2021 Microsoft Corp.
+
+   Microsoft DirectX 11 Tutorial Wiki
+
+   Nbody
+   (c) 2019 Fabio Baruffa
+
+   Nothings/STB
+   Copyright (c) 2017 Sean Barrett
+
+   Plotly.js
+   Copyright (c) 2020 Plotly, Inc
+
+   pytracing
+   Copyright (c) 2015 Kris Wilson
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+--------------------------------------------------------------------------------
+9. Stream
+
+***NOTE: This is a modified version of Stream, hence sectin 3b of the license applies.
+
+* Copyright 1991-2003: John D. McCalpin
+*-----------------------------------------------------------------------
+* License:
+*  1. You are free to use this program and/or to redistribute
+*     this program.
+*  2. You are free to modify this program for your own use,
+*     including commercial use, subject to the publication
+*     restrictions in item 3.
+*  3. You are free to publish results obtained from running this
+*     program, or from works that you derive from this program,
+*     with the following limitations:
+*     3a. In order to be referred to as "STREAM benchmark results",
+*         published results must be in conformance to the STREAM
+*         Run Rules, (briefly reviewed below) published at
+*         http://www.cs.virginia.edu/stream/ref.html
+*         and incorporated herein by reference.
+*         As the copyright holder, John McCalpin retains the
+*         right to determine conformity with the Run Rules.
+*     3b. Results based on modified source code or on runs not in
+*         accordance with the STREAM Run Rules must be clearly
+*         labelled whenever they are published.  Examples of
+*         proper labelling include:
+*         "tuned STREAM benchmark results" 
+*         "based on a variant of the STREAM benchmark code"
+*         Other comparable, clear and reasonable labelling is
+*         acceptable.
+*     3c. Submission of results to the STREAM benchmark web site
+*         is encouraged, but not required.
+*  4. Use of this program or creation of derived works based on this
+*     program constitutes acceptance of these licensing restrictions.
+*  5. Absolutely no warranty is expressed or implied.
+
+--------------------------------------------------------------------------------
+10.  FGPA example designs-gzip
+
+    SDL2.0
+
+zlib License
+
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+--------------------------------------------------------------------------------
+The following third party programs have their own third party program files as well. These additional third party program files are as follows:
+
+1. Intel® Implicit SPMD Program Compiler (Intel® ISPC)