diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/CMakeLists.txt b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/CMakeLists.txt new file mode 100644 index 0000000000..fb4854f504 --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/CMakeLists.txt @@ -0,0 +1,9 @@ +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl") +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") +include_directories(${CMAKE_SOURCE_DIR}/01_dpct_output/Common/) +include_directories(${CMAKE_SOURCE_DIR}/01_dpct_output/include/) + +add_executable (01_dpct_output Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp.dp.cpp Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.dp.cpp Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_gold.cpp) +target_link_libraries(01_dpct_output sycl) + +add_custom_target (run cd ${CMAKE_SOURCE_DIR}/01_dpct_output/ && ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/01_dpct_output) diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Common/exception.h b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Common/exception.h new file mode 100644 index 0000000000..ca8ac25258 --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Common/exception.h @@ -0,0 +1,151 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* CUda UTility Library */ +#ifndef COMMON_EXCEPTION_H_ +#define COMMON_EXCEPTION_H_ + +// includes, system +#include +#include +#include +#include +#include + +//! Exception wrapper. +//! @param Std_Exception Exception out of namespace std for easy typing. +template +class Exception : public Std_Exception { + public: + //! @brief Static construction interface + //! @return Alwayss throws ( Located_Exception) + //! @param file file in which the Exception occurs + //! @param line line in which the Exception occurs + //! @param detailed details on the code fragment causing the Exception + static void throw_it(const char *file, const int line, + const char *detailed = "-"); + + //! Static construction interface + //! @return Alwayss throws ( Located_Exception) + //! 
@param file file in which the Exception occurs + //! @param line line in which the Exception occurs + //! @param detailed details on the code fragment causing the Exception + static void throw_it(const char *file, const int line, + const std::string &detailed); + + //! Destructor + virtual ~Exception() throw(); + + private: + //! Constructor, default (private) + Exception(); + + //! Constructor, standard + //! @param str string returned by what() + explicit Exception(const std::string &str); +}; + +//////////////////////////////////////////////////////////////////////////////// +//! Exception handler function for arbitrary exceptions +//! @param ex exception to handle +//////////////////////////////////////////////////////////////////////////////// +template +inline void handleException(const Exception_Typ &ex) { + std::cerr << ex.what() << std::endl; + + exit(EXIT_FAILURE); +} + +//! Convenience macros + +//! Exception caused by dynamic program behavior, e.g. file does not exist +#define RUNTIME_EXCEPTION(msg) \ + Exception::throw_it(__FILE__, __LINE__, msg) + +//! Logic exception in program, e.g. an assert failed +#define LOGIC_EXCEPTION(msg) \ + Exception::throw_it(__FILE__, __LINE__, msg) + +//! Out of range exception +#define RANGE_EXCEPTION(msg) \ + Exception::throw_it(__FILE__, __LINE__, msg) + +//////////////////////////////////////////////////////////////////////////////// +//! Implementation + +// includes, system +#include + +//////////////////////////////////////////////////////////////////////////////// +//! Static construction interface. +//! @param Exception causing code fragment (file and line) and detailed infos. 
+//////////////////////////////////////////////////////////////////////////////// +/*static*/ template +void Exception::throw_it(const char *file, const int line, + const char *detailed) { + std::stringstream s; + + // Quiet heavy-weight but exceptions are not for + // performance / release versions + s << "Exception in file '" << file << "' in line " << line << "\n" + << "Detailed description: " << detailed << "\n"; + + throw Exception(s.str()); +} + +//////////////////////////////////////////////////////////////////////////////// +//! Static construction interface. +//! @param Exception causing code fragment (file and line) and detailed infos. +//////////////////////////////////////////////////////////////////////////////// +/*static*/ template +void Exception::throw_it(const char *file, const int line, + const std::string &msg) { + throw_it(file, line, msg.c_str()); +} + +//////////////////////////////////////////////////////////////////////////////// +//! Constructor, default (private). +//////////////////////////////////////////////////////////////////////////////// +template +Exception::Exception() : Std_Exception("Unknown Exception.\n") {} + +//////////////////////////////////////////////////////////////////////////////// +//! Constructor, standard (private). +//! String returned by what(). +//////////////////////////////////////////////////////////////////////////////// +template +Exception::Exception(const std::string &s) : Std_Exception(s) {} + +//////////////////////////////////////////////////////////////////////////////// +//! 
Destructor +//////////////////////////////////////////////////////////////////////////////// +template +Exception::~Exception() throw() {} + + // functions, exported + +#endif // COMMON_EXCEPTION_H_ diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Common/helper_cuda.h b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Common/helper_cuda.h new file mode 100644 index 0000000000..93ddee5248 --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Common/helper_cuda.h @@ -0,0 +1,1053 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +//////////////////////////////////////////////////////////////////////////////// +// These are CUDA Helper functions for initialization and error checking + +#ifndef COMMON_HELPER_CUDA_H_ +#define COMMON_HELPER_CUDA_H_ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// Note, it is required that your SDK sample to include the proper header +// files, please refer the CUDA examples for examples of the needed CUDA +// headers, which may change depending on which CUDA functions are used. + +// CUDA Runtime error messages +#ifdef __DPCT_HPP__ +static const char *_cudaGetErrorEnum(int error) { + /* + DPCT1009:4: SYCL uses exceptions to report errors and does not use the error + codes. The original code was commented out and a warning string was inserted. + You need to rewrite this code. + */ + return "cudaGetErrorName is not supported" /*cudaGetErrorName(error)*/; +} +#endif + +#ifdef CUDA_DRIVER_API +// CUDA Driver API errors +static const char *_cudaGetErrorEnum(CUresult error) { + static char unknown[] = ""; + const char *ret = NULL; + cuGetErrorName(error, &ret); + return ret ? 
ret : unknown; +} +#endif + +#ifdef CUBLAS_API_H_ +// cuBLAS API errors +static const char *_cudaGetErrorEnum(cublasStatus_t error) { + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + } + + return ""; +} +#endif + +#ifdef _CUFFT_H_ +// cuFFT API errors +static const char *_cudaGetErrorEnum(cufftResult error) { + switch (error) { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; + + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + + case CUFFT_NOT_IMPLEMENTED: + return 
"CUFFT_NOT_IMPLEMENTED"; + + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; + + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSPARSEAPI +// cuSPARSE API errors +static const char *_cudaGetErrorEnum(cusparseStatus_t error) { + switch (error) { + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; + + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; + + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; + + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; + + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; + + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSOLVER_COMMON_H_ +// cuSOLVER API errors +static const char *_cudaGetErrorEnum(cusolverStatus_t error) { + switch (error) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_MAPPING_ERROR: + return "CUSOLVER_STATUS_MAPPING_ERROR"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case 
CUSOLVER_STATUS_NOT_SUPPORTED: + return "CUSOLVER_STATUS_NOT_SUPPORTED "; + case CUSOLVER_STATUS_ZERO_PIVOT: + return "CUSOLVER_STATUS_ZERO_PIVOT"; + case CUSOLVER_STATUS_INVALID_LICENSE: + return "CUSOLVER_STATUS_INVALID_LICENSE"; + } + + return ""; +} +#endif + +#ifdef CURAND_H_ +// cuRAND API errors +static const char *_cudaGetErrorEnum(int error) { + switch (error) { + case 0: + return "CURAND_STATUS_SUCCESS"; + + case 100: + return "CURAND_STATUS_VERSION_MISMATCH"; + + case 101: + return "CURAND_STATUS_NOT_INITIALIZED"; + + case 102: + return "CURAND_STATUS_ALLOCATION_FAILED"; + + case 103: + return "CURAND_STATUS_TYPE_ERROR"; + + case 104: + return "CURAND_STATUS_OUT_OF_RANGE"; + + case 105: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + + case 106: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + + case 201: + return "CURAND_STATUS_LAUNCH_FAILURE"; + + case 202: + return "CURAND_STATUS_PREEXISTING_FAILURE"; + + case 203: + return "CURAND_STATUS_INITIALIZATION_FAILED"; + + case 204: + return "CURAND_STATUS_ARCH_MISMATCH"; + + case 999: + return "CURAND_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +#ifdef NVJPEGAPI +// nvJPEG API errors +static const char *_cudaGetErrorEnum(nvjpegStatus_t error) { + switch (error) { + case NVJPEG_STATUS_SUCCESS: + return "NVJPEG_STATUS_SUCCESS"; + + case NVJPEG_STATUS_NOT_INITIALIZED: + return "NVJPEG_STATUS_NOT_INITIALIZED"; + + case NVJPEG_STATUS_INVALID_PARAMETER: + return "NVJPEG_STATUS_INVALID_PARAMETER"; + + case NVJPEG_STATUS_BAD_JPEG: + return "NVJPEG_STATUS_BAD_JPEG"; + + case NVJPEG_STATUS_JPEG_NOT_SUPPORTED: + return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED"; + + case NVJPEG_STATUS_ALLOCATOR_FAILURE: + return "NVJPEG_STATUS_ALLOCATOR_FAILURE"; + + case NVJPEG_STATUS_EXECUTION_FAILED: + return "NVJPEG_STATUS_EXECUTION_FAILED"; + + case NVJPEG_STATUS_ARCH_MISMATCH: + return "NVJPEG_STATUS_ARCH_MISMATCH"; + + case NVJPEG_STATUS_INTERNAL_ERROR: + return "NVJPEG_STATUS_INTERNAL_ERROR"; + } + + return ""; +} 
+#endif + +#ifdef NV_NPPIDEFS_H +// NPP API errors +static const char *_cudaGetErrorEnum(NppStatus error) { + switch (error) { + case NPP_NOT_SUPPORTED_MODE_ERROR: + return "NPP_NOT_SUPPORTED_MODE_ERROR"; + + case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_RESIZE_NO_OPERATION_ERROR: + return "NPP_RESIZE_NO_OPERATION_ERROR"; + + case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: + return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_BAD_ARG_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFF_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECT_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUAD_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEM_ALLOC_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_INPUT: + return "NPP_INVALID_INPUT"; + + case NPP_POINTER_ERROR: + return "NPP_POINTER_ERROR"; + + case NPP_WARNING: + return "NPP_WARNING"; + + case NPP_ODD_ROI_WARNING: + return "NPP_ODD_ROI_WARNING"; +#else + + // These are for CUDA 5.5 or higher + case NPP_BAD_ARGUMENT_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFFICIENT_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECTANGLE_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUADRANGLE_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEMORY_ALLOCATION_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_HOST_POINTER_ERROR: + return "NPP_INVALID_HOST_POINTER_ERROR"; + + case NPP_INVALID_DEVICE_POINTER_ERROR: + return "NPP_INVALID_DEVICE_POINTER_ERROR"; +#endif + + case NPP_LUT_NUMBER_OF_LEVELS_ERROR: + return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; + + case NPP_TEXTURE_BIND_ERROR: + return "NPP_TEXTURE_BIND_ERROR"; + + case 
NPP_WRONG_INTERSECTION_ROI_ERROR: + return "NPP_WRONG_INTERSECTION_ROI_ERROR"; + + case NPP_NOT_EVEN_STEP_ERROR: + return "NPP_NOT_EVEN_STEP_ERROR"; + + case NPP_INTERPOLATION_ERROR: + return "NPP_INTERPOLATION_ERROR"; + + case NPP_RESIZE_FACTOR_ERROR: + return "NPP_RESIZE_FACTOR_ERROR"; + + case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: + return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_MEMFREE_ERR: + return "NPP_MEMFREE_ERR"; + + case NPP_MEMSET_ERR: + return "NPP_MEMSET_ERR"; + + case NPP_MEMCPY_ERR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERR: + return "NPP_MIRROR_FLIP_ERR"; +#else + + case NPP_MEMFREE_ERROR: + return "NPP_MEMFREE_ERROR"; + + case NPP_MEMSET_ERROR: + return "NPP_MEMSET_ERROR"; + + case NPP_MEMCPY_ERROR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERROR: + return "NPP_MIRROR_FLIP_ERROR"; +#endif + + case NPP_ALIGNMENT_ERROR: + return "NPP_ALIGNMENT_ERROR"; + + case NPP_STEP_ERROR: + return "NPP_STEP_ERROR"; + + case NPP_SIZE_ERROR: + return "NPP_SIZE_ERROR"; + + case NPP_NULL_POINTER_ERROR: + return "NPP_NULL_POINTER_ERROR"; + + case NPP_CUDA_KERNEL_EXECUTION_ERROR: + return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; + + case NPP_NOT_IMPLEMENTED_ERROR: + return "NPP_NOT_IMPLEMENTED_ERROR"; + + case NPP_ERROR: + return "NPP_ERROR"; + + case NPP_SUCCESS: + return "NPP_SUCCESS"; + + case NPP_WRONG_INTERSECTION_QUAD_WARNING: + return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; + + case NPP_MISALIGNED_DST_ROI_WARNING: + return "NPP_MISALIGNED_DST_ROI_WARNING"; + + case NPP_AFFINE_QUAD_INCORRECT_WARNING: + return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; + + case NPP_DOUBLE_SIZE_WARNING: + return "NPP_DOUBLE_SIZE_WARNING"; + + case NPP_WRONG_INTERSECTION_ROI_WARNING: + return "NPP_WRONG_INTERSECTION_ROI_WARNING"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000 + /* These are 6.0 or higher */ + case NPP_LUT_PALETTE_BITSIZE_ERROR: + return 
"NPP_LUT_PALETTE_BITSIZE_ERROR"; + + case NPP_ZC_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_QUALITY_INDEX_ERROR: + return "NPP_QUALITY_INDEX_ERROR"; + + case NPP_CHANNEL_ORDER_ERROR: + return "NPP_CHANNEL_ORDER_ERROR"; + + case NPP_ZERO_MASK_VALUE_ERROR: + return "NPP_ZERO_MASK_VALUE_ERROR"; + + case NPP_NUMBER_OF_CHANNELS_ERROR: + return "NPP_NUMBER_OF_CHANNELS_ERROR"; + + case NPP_COI_ERROR: + return "NPP_COI_ERROR"; + + case NPP_DIVISOR_ERROR: + return "NPP_DIVISOR_ERROR"; + + case NPP_CHANNEL_ERROR: + return "NPP_CHANNEL_ERROR"; + + case NPP_STRIDE_ERROR: + return "NPP_STRIDE_ERROR"; + + case NPP_ANCHOR_ERROR: + return "NPP_ANCHOR_ERROR"; + + case NPP_MASK_SIZE_ERROR: + return "NPP_MASK_SIZE_ERROR"; + + case NPP_MOMENT_00_ZERO_ERROR: + return "NPP_MOMENT_00_ZERO_ERROR"; + + case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR: + return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR"; + + case NPP_THRESHOLD_ERROR: + return "NPP_THRESHOLD_ERROR"; + + case NPP_CONTEXT_MATCH_ERROR: + return "NPP_CONTEXT_MATCH_ERROR"; + + case NPP_FFT_FLAG_ERROR: + return "NPP_FFT_FLAG_ERROR"; + + case NPP_FFT_ORDER_ERROR: + return "NPP_FFT_ORDER_ERROR"; + + case NPP_SCALE_RANGE_ERROR: + return "NPP_SCALE_RANGE_ERROR"; + + case NPP_DATA_TYPE_ERROR: + return "NPP_DATA_TYPE_ERROR"; + + case NPP_OUT_OFF_RANGE_ERROR: + return "NPP_OUT_OFF_RANGE_ERROR"; + + case NPP_DIVIDE_BY_ZERO_ERROR: + return "NPP_DIVIDE_BY_ZERO_ERROR"; + + case NPP_RANGE_ERROR: + return "NPP_RANGE_ERROR"; + + case NPP_NO_MEMORY_ERROR: + return "NPP_NO_MEMORY_ERROR"; + + case NPP_ERROR_RESERVED: + return "NPP_ERROR_RESERVED"; + + case NPP_NO_OPERATION_WARNING: + return "NPP_NO_OPERATION_WARNING"; + + case NPP_DIVIDE_BY_ZERO_WARNING: + return "NPP_DIVIDE_BY_ZERO_WARNING"; +#endif + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000 + /* These are 7.0 or higher */ + case NPP_OVERFLOW_ERROR: + return "NPP_OVERFLOW_ERROR"; + + case NPP_CORRUPTED_DATA_ERROR: + return 
"NPP_CORRUPTED_DATA_ERROR"; +#endif + } + + return ""; +} +#endif + +template +void check(T result, char const *const func, const char *const file, + int const line) { +} + +#ifdef __DPCT_HPP__ +// This will output the proper CUDA error strings in the event +// that a CUDA host call returns an error +#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) + +// This will output the proper error string when calling cudaGetLastError +#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__) + +inline void __getLastCudaError(const char *errorMessage, const char *file, + const int line) { + /* + DPCT1010:5: SYCL uses exceptions to report errors and does not use the error + codes. The call was replaced with 0. You need to rewrite this code. + */ + int err = 0; +} + +// This will only print the proper error string when calling cudaGetLastError +// but not exit program incase error detected. +#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__) + +inline void __printLastCudaError(const char *errorMessage, const char *file, + const int line) { + /* + DPCT1010:7: SYCL uses exceptions to report errors and does not use the error + codes. The call was replaced with 0. You need to rewrite this code. + */ + int err = 0; +} +#endif + +#ifndef MAX +#define MAX(a, b) (a > b ? a : b) +#endif + +// Float To Int conversion +inline int ftoi(float value) { + return (value >= 0 ? 
static_cast(value + 0.5) + : static_cast(value - 0.5)); +} + +// Beginning of GPU Architecture definitions +inline int _ConvertSMVer2Cores(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine + // the # of cores per SM + typedef struct dpct_type_771618 { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = { + {0x30, 192}, + {0x32, 192}, + {0x35, 192}, + {0x37, 192}, + {0x50, 128}, + {0x52, 128}, + {0x53, 128}, + {0x60, 64}, + {0x61, 128}, + {0x62, 128}, + {0x70, 64}, + {0x72, 64}, + {0x75, 64}, + {0x80, 64}, + {0x86, 128}, + {0x87, 128}, + {0x89, 128}, + {0x90, 128}, + {-1, -1}}; + + int index = 0; + + while (nGpuArchCoresPerSM[index].SM != -1) { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { + return nGpuArchCoresPerSM[index].Cores; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoCores for SM %d.%d is undefined." 
+ " Default to use %d Cores/SM\n", + major, minor, nGpuArchCoresPerSM[index - 1].Cores); + return nGpuArchCoresPerSM[index - 1].Cores; +} + +inline const char* _ConvertSMVer2ArchName(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine + // the GPU Arch name) + typedef struct dpct_type_506579 { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + const char* name; + } sSMtoArchName; + + sSMtoArchName nGpuArchNameSM[] = { + {0x30, "Kepler"}, + {0x32, "Kepler"}, + {0x35, "Kepler"}, + {0x37, "Kepler"}, + {0x50, "Maxwell"}, + {0x52, "Maxwell"}, + {0x53, "Maxwell"}, + {0x60, "Pascal"}, + {0x61, "Pascal"}, + {0x62, "Pascal"}, + {0x70, "Volta"}, + {0x72, "Xavier"}, + {0x75, "Turing"}, + {0x80, "Ampere"}, + {0x86, "Ampere"}, + {0x87, "Ampere"}, + {0x89, "Ada"}, + {0x90, "Hopper"}, + {-1, "Graphics Device"}}; + + int index = 0; + + while (nGpuArchNameSM[index].SM != -1) { + if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) { + return nGpuArchNameSM[index].name; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoArchName for SM %d.%d is undefined." + " Default to use %s\n", + major, minor, nGpuArchNameSM[index - 1].name); + return nGpuArchNameSM[index - 1].name; +} + // end of GPU Architecture definitions + +#ifdef __DPCT_HPP__ +// General GPU Device CUDA Initialization +inline int gpuDeviceInit(int devID) { + int device_count; + /* + DPCT1003:9: Migrated API does not return error code. (*, 0) is inserted. You + may need to rewrite this code. 
+ */ + checkCudaErrors((device_count = dpct::dev_mgr::instance().device_count(), 0)); + + if (device_count == 0) { + fprintf(stderr, + "gpuDeviceInit() CUDA error: " + "no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + if (devID < 0) { + devID = 0; + } + + if (devID > device_count - 1) { + fprintf(stderr, "\n"); + fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", + device_count); + fprintf(stderr, + ">> gpuDeviceInit (-device=%d) is not a valid" + " GPU device. <<\n", + devID); + fprintf(stderr, "\n"); + return -devID; + } + + int computeMode = -1, major = 0, minor = 0; + /* + DPCT1035:10: All SYCL devices can be used by host to submit tasks. You may + need to adjust this code. + */ + checkCudaErrors((computeMode = 1, 0)); + checkCudaErrors( + (major = dpct::dev_mgr::instance().get_device(devID).get_major_version(), + 0)); + checkCudaErrors( + (minor = dpct::dev_mgr::instance().get_device(devID).get_minor_version(), + 0)); + /* + DPCT1035:11: All SYCL devices can be used by host to submit tasks. You may + need to adjust this code. + */ + if (computeMode == 0) { + fprintf(stderr, + "Error: device is running in , no threads can use cudaSetDevice().\n"); + return -1; + } + + if (major < 1) { + fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); + exit(EXIT_FAILURE); + } + + /* + DPCT1093:12: The "devID" may not be the best XPU device. Adjust the selected + device if needed. + */ + /* + DPCT1003:13: Migrated API does not return error code. (*, 0) is inserted. You + may need to rewrite this code. 
+ */ + checkCudaErrors((dpct::select_device(devID), 0)); + printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, _ConvertSMVer2ArchName(major, minor)); + + return devID; +} + +// This function returns the best GPU (with maximum GFLOPS) +inline int gpuGetMaxGflopsDeviceId() try { + int current_device = 0, sm_per_multiproc = 0; + int max_perf_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + uint64_t max_compute_perf = 0; + /* + DPCT1003:14: Migrated API does not return error code. (*, 0) is inserted. You + may need to rewrite this code. + */ + checkCudaErrors((device_count = dpct::dev_mgr::instance().device_count(), 0)); + + if (device_count == 0) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the best CUDA capable GPU device + current_device = 0; + + while (current_device < device_count) { + int computeMode = -1, major = 0, minor = 0; + /* + DPCT1035:15: All SYCL devices can be used by host to submit tasks. You may + need to adjust this code. + */ + checkCudaErrors((computeMode = 1, 0)); + checkCudaErrors((major = dpct::dev_mgr::instance() + .get_device(current_device) + .get_major_version(), + 0)); + checkCudaErrors((minor = dpct::dev_mgr::instance() + .get_device(current_device) + .get_minor_version(), + 0)); + + // If this GPU is not running on Compute Mode prohibited, + // then we can add it to the list + /* + DPCT1035:16: All SYCL devices can be used by host to submit tasks. You may + need to adjust this code. 
+ */ + if (computeMode != 0) { + if (major == 9999 && minor == 9999) { + sm_per_multiproc = 1; + } else { + sm_per_multiproc = + _ConvertSMVer2Cores(major, minor); + } + int multiProcessorCount = 0, clockRate = 0; + checkCudaErrors((multiProcessorCount = dpct::dev_mgr::instance() + .get_device(current_device) + .get_max_compute_units(), + 0)); + int result = (clockRate = dpct::dev_mgr::instance() + .get_device(current_device) + .get_max_clock_frequency(), + 0); + + uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate; + + if (compute_perf > max_compute_perf) { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } else { + devices_prohibited++; + } + + ++current_device; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " all devices have compute mode prohibited.\n"); + exit(EXIT_FAILURE); + } + + return max_perf_device; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +// Initialization code to find the best CUDA Device +inline int findCudaDevice(int argc, const char **argv) { + int devID = 0; + + // If the command-line has a device number specified, use it + if (checkCmdLineFlag(argc, argv, "device")) { + devID = getCmdLineArgumentInt(argc, argv, "device="); + + if (devID < 0) { + printf("Invalid command line parameter\n "); + exit(EXIT_FAILURE); + } else { + devID = gpuDeviceInit(devID); + + if (devID < 0) { + printf("exiting...\n"); + exit(EXIT_FAILURE); + } + } + } else { + // Otherwise pick the device with highest Gflops/s + devID = gpuGetMaxGflopsDeviceId(); + /* + DPCT1093:17: The "devID" may not be the best XPU device. Adjust the selected + device if needed. + */ + /* + DPCT1003:18: Migrated API does not return error code. (*, 0) is inserted. + You may need to rewrite this code. 
+ */ + checkCudaErrors((dpct::select_device(devID), 0)); + int major = 0, minor = 0; + checkCudaErrors(( + major = dpct::dev_mgr::instance().get_device(devID).get_major_version(), + 0)); + checkCudaErrors(( + minor = dpct::dev_mgr::instance().get_device(devID).get_minor_version(), + 0)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + devID, _ConvertSMVer2ArchName(major, minor), major, minor); + + } + + return devID; +} + +inline int findIntegratedGPU() { + int current_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + /* + DPCT1003:19: Migrated API does not return error code. (*, 0) is inserted. You + may need to rewrite this code. + */ + checkCudaErrors((device_count = dpct::dev_mgr::instance().device_count(), 0)); + + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the integrated GPU which is compute capable + while (current_device < device_count) { + int computeMode = -1, integrated = -1; + /* + DPCT1035:20: All SYCL devices can be used by host to submit tasks. You may + need to adjust this code. + */ + checkCudaErrors((computeMode = 1, 0)); + checkCudaErrors((integrated = dpct::dev_mgr::instance() + .get_device(current_device) + .get_integrated(), + 0)); + // If GPU is integrated and is not running on Compute Mode prohibited, + // then cuda can map to GLES resource + /* + DPCT1035:21: All SYCL devices can be used by host to submit tasks. You may + need to adjust this code. + */ + if (integrated && (computeMode != 0)) { + /* + DPCT1093:22: The "current_device" may not be the best XPU device. Adjust + the selected device if needed. + */ + /* + DPCT1003:23: Migrated API does not return error code. (*, 0) is inserted. + You may need to rewrite this code. 
+ */ + checkCudaErrors((dpct::select_device(current_device), 0)); + + int major = 0, minor = 0; + checkCudaErrors((major = dpct::dev_mgr::instance() + .get_device(current_device) + .get_major_version(), + 0)); + checkCudaErrors((minor = dpct::dev_mgr::instance() + .get_device(current_device) + .get_minor_version(), + 0)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + current_device, _ConvertSMVer2ArchName(major, minor), major, minor); + + return current_device; + } else { + devices_prohibited++; + } + + current_device++; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "CUDA error:" + " No GLES-CUDA Interop capable GPU found.\n"); + exit(EXIT_FAILURE); + } + + return -1; +} + +// General check for CUDA GPU SM Capabilities +inline bool checkCudaCapabilities(int major_version, int minor_version) { + int dev; + int major = 0, minor = 0; + + checkCudaErrors(dev = dpct::dev_mgr::instance().current_device_id()); + checkCudaErrors( + (major = dpct::dev_mgr::instance().get_device(dev).get_major_version(), + 0)); + checkCudaErrors( + (minor = dpct::dev_mgr::instance().get_device(dev).get_minor_version(), + 0)); + + if ((major > major_version) || + (major == major_version && + minor >= minor_version)) { + printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, + _ConvertSMVer2ArchName(major, minor), major, minor); + return true; + } else { + printf( + " No GPU device was found that can support " + "CUDA compute capability %d.%d.\n", + major_version, minor_version); + return false; + } +} +#endif + + // end of CUDA Helper Functions + +#endif // COMMON_HELPER_CUDA_H_ diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Common/helper_cuda.h.yaml b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Common/helper_cuda.h.yaml new file mode 100644 index 0000000000..bcf5416a0d --- /dev/null +++ 
b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Common/helper_cuda.h.yaml @@ -0,0 +1,889 @@ +--- +MainSourceFile: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/dpct_output/Common/helper_cuda.h' +Replacements: + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 1793 + Length: 0 + ReplacementText: "#include \n#include \n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 2210 + Length: 18 + ReplacementText: __DPCT_HPP__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 2266 + Length: 11 + ReplacementText: int + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 2287 + Length: 0 + ReplacementText: " /*\n DPCT1009:4: SYCL uses exceptions to report errors and does not use the error codes. The original code was commented out and a warning string was inserted. 
You need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 2296 + Length: 23 + ReplacementText: '"cudaGetErrorName is not supported"/*cudaGetErrorName(error)*/' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7109 + Length: 14 + ReplacementText: int + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7161 + Length: 21 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7232 + Length: 30 + ReplacementText: '100' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7321 + Length: 29 + ReplacementText: '101' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7408 + Length: 31 + ReplacementText: '102' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7499 + Length: 24 + ReplacementText: '103' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7576 + Length: 26 + ReplacementText: '104' + ConstantFlag: '' + 
ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7657 + Length: 33 + ReplacementText: '105' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7752 + Length: 39 + ReplacementText: '106' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7859 + Length: 28 + ReplacementText: '201' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7944 + Length: 33 + ReplacementText: '202' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 8039 + Length: 35 + ReplacementText: '203' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 8138 + Length: 27 + ReplacementText: '204' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 8221 + Length: 28 + ReplacementText: '999' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 15785 + Length: 199 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: 
'/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 15995 + Length: 18 + ReplacementText: __DPCT_HPP__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 16461 + Length: 0 + ReplacementText: " /*\n DPCT1010:5: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 16463 + Length: 11 + ReplacementText: int + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 16481 + Length: 18 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 16504 + Length: 259 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 17099 + Length: 0 + ReplacementText: " /*\n DPCT1010:7: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. 
You need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 17101 + Length: 11 + ReplacementText: int + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 17119 + Length: 18 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 17142 + Length: 235 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 17829 + Length: 0 + ReplacementText: ' dpct_type_771618' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 19036 + Length: 0 + ReplacementText: ' dpct_type_506579' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 20194 + Length: 18 + ReplacementText: __DPCT_HPP__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 20313 + Length: 0 + ReplacementText: " /*\n DPCT1003:9: Migrated API does not return error code. (*, 0) is inserted. 
You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 20331 + Length: 33 + ReplacementText: '(device_count = dpct::dev_mgr::instance().device_count(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 20960 + Length: 0 + ReplacementText: " /*\n DPCT1035:10: All SYCL devices can be used by host to submit tasks. You may need to adjust this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 20978 + Length: 67 + ReplacementText: '(computeMode = 1, 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 21066 + Length: 72 + ReplacementText: '(major = dpct::dev_mgr::instance().get_device(devID).get_major_version(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 21159 + Length: 72 + ReplacementText: '(minor = dpct::dev_mgr::instance().get_device(devID).get_minor_version(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 21234 + Length: 0 + ReplacementText: " /*\n DPCT1035:11: All SYCL devices can be used by host to submit tasks. 
You may need to adjust this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 21255 + Length: 25 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 21573 + Length: 0 + ReplacementText: " /*\n DPCT1093:12: The \"devID\" may not be the best XPU device. Adjust the selected device if needed.\n */\n /*\n DPCT1003:13: Migrated API does not return error code. (*, 0) is inserted. You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 21591 + Length: 13 + ReplacementText: '(dpct::select_device' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 21611 + Length: 0 + ReplacementText: ', 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 21830 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 21995 + Length: 0 + ReplacementText: " /*\n DPCT1003:14: Migrated API does not return error code. (*, 0) is inserted. 
You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 22013 + Length: 33 + ReplacementText: '(device_count = dpct::dev_mgr::instance().device_count(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 22381 + Length: 0 + ReplacementText: " /*\n DPCT1035:15: All SYCL devices can be used by host to submit tasks. You may need to adjust this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 22401 + Length: 76 + ReplacementText: '(computeMode = 1, 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 22500 + Length: 81 + ReplacementText: '(major = dpct::dev_mgr::instance().get_device(current_device).get_major_version(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 22604 + Length: 81 + ReplacementText: '(minor = dpct::dev_mgr::instance().get_device(current_device).get_minor_version(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 22789 + Length: 0 + ReplacementText: " /*\n DPCT1035:16: All SYCL devices can be used by host to submit tasks. 
You may need to adjust this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 22812 + Length: 25 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 23085 + Length: 92 + ReplacementText: '(multiProcessorCount = dpct::dev_mgr::instance().get_device(current_device).get_max_compute_units(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 23186 + Length: 11 + ReplacementText: int + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 23207 + Length: 72 + ReplacementText: '(clockRate = dpct::dev_mgr::instance().get_device(current_device).get_max_clock_frequency(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 23287 + Length: 473 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 24298 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + 
Offset: 24947 + Length: 0 + ReplacementText: " /*\n DPCT1093:17: The \"devID\" may not be the best XPU device. Adjust the selected device if needed.\n */\n /*\n DPCT1003:18: Migrated API does not return error code. (*, 0) is inserted. You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 24967 + Length: 13 + ReplacementText: '(dpct::select_device' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 24987 + Length: 0 + ReplacementText: ', 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 25040 + Length: 72 + ReplacementText: '(major = dpct::dev_mgr::instance().get_device(devID).get_major_version(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 25135 + Length: 72 + ReplacementText: '(minor = dpct::dev_mgr::instance().get_device(devID).get_minor_version(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 25490 + Length: 0 + ReplacementText: " /*\n DPCT1003:19: Migrated API does not return error code. (*, 0) is inserted. 
You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 25508 + Length: 33 + ReplacementText: '(device_count = dpct::dev_mgr::instance().device_count(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 25806 + Length: 0 + ReplacementText: " /*\n DPCT1035:20: All SYCL devices can be used by host to submit tasks. You may need to adjust this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 25826 + Length: 76 + ReplacementText: '(computeMode = 1, 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 25925 + Length: 74 + ReplacementText: '(integrated = dpct::dev_mgr::instance().get_device(current_device).get_integrated(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 26119 + Length: 0 + ReplacementText: " /*\n DPCT1035:21: All SYCL devices can be used by host to submit tasks. 
You may need to adjust this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 26157 + Length: 25 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 26187 + Length: 0 + ReplacementText: " /*\n DPCT1093:22: The \"current_device\" may not be the best XPU device. Adjust the selected device if needed.\n */\n /*\n DPCT1003:23: Migrated API does not return error code. (*, 0) is inserted. You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 26209 + Length: 13 + ReplacementText: '(dpct::select_device' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 26238 + Length: 0 + ReplacementText: ', 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 26296 + Length: 81 + ReplacementText: '(major = dpct::dev_mgr::instance().get_device(current_device).get_major_version(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 26402 + Length: 81 + ReplacementText: '(minor = dpct::dev_mgr::instance().get_device(current_device).get_minor_version(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: 
'/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 27117 + Length: 19 + ReplacementText: 'dev = dpct::dev_mgr::instance().current_device_id()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 27157 + Length: 70 + ReplacementText: '(major = dpct::dev_mgr::instance().get_device(dev).get_major_version(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 27248 + Length: 70 + ReplacementText: '(minor = dpct::dev_mgr::instance().get_device(dev).get_minor_version(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false +MainSourceFilesDigest: + - MainSourceFile: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Digest: b335aa967b6d80921de9d6528e1564a5 +DpctVersion: 2023.0.0 +MainHelperFileName: dpct +USMLevel: '' +FeatureMap: + device.hpp: + dev_mgr: + IsCalled: false + FeatureName: '' + SubFeatureMap: + dev_mgr_current_device_id: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'dev_mgr::current_device_id' + SubFeatureMap: {} + dev_mgr_device_count: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'dev_mgr::device_count' + SubFeatureMap: {} + dev_mgr_get_device: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'dev_mgr::get_device' + SubFeatureMap: {} + device_ext: + IsCalled: false + FeatureName: '' + SubFeatureMap: + device_ext_get_integrated: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_integrated' + 
SubFeatureMap: {} + device_ext_get_major_version: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_major_version' + SubFeatureMap: {} + device_ext_get_max_clock_frequency: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_max_clock_frequency' + SubFeatureMap: {} + device_ext_get_max_compute_units: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_max_compute_units' + SubFeatureMap: {} + device_ext_get_minor_version: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_minor_version' + SubFeatureMap: {} + device_ext_queues_wait_and_throw: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: 'device_ext::queues_wait_and_throw' + SubFeatureMap: {} + get_current_device: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: get_current_device + SubFeatureMap: {} + get_default_queue: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: get_default_queue + SubFeatureMap: {} + select_device: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: select_device + SubFeatureMap: {} + memory.hpp: + constant_memory_alias: + IsCalled: true + CallerSrcFiles: + - 
'/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: constant_memory + SubFeatureMap: {} + device_memory: + IsCalled: false + FeatureName: '' + SubFeatureMap: + device_memory_get_ptr: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: 'device_memory::get_ptr' + SubFeatureMap: {} + device_memory_init: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: 'device_memory::init' + SubFeatureMap: {} + dpct_memcpy: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_memcpy + SubFeatureMap: {} + dpct_memcpy_2d: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_memcpy + SubFeatureMap: {} + dpct_memcpy_3d: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_memcpy + SubFeatureMap: {} + memcpy_direction: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: memcpy_direction + SubFeatureMap: {} +CompileTargets: {} +OptionMap: + AnalysisScopePath: + Value: '/home/tcs/Manjula_workspace/cuda-samples' + Specified: false + AsyncHandler: + Value: 'false' + Specified: false + CommentsEnabled: + Value: 'false' + Specified: false + CompilationsDir: + Value: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable' + Specified: true + CtadEnabled: + Value: 'false' + Specified: 
false + CustomHelperFileName: + Value: dpct + Specified: false + ExperimentalFlag: + Value: '0' + Specified: false + ExplicitClNamespace: + Value: 'false' + Specified: false + ExplicitNamespace: + Value: '20' + Specified: false + ExtensionDDFlag: + Value: '0' + Specified: false + ExtensionDEFlag: + Value: '4294967295' + Specified: false + NDRangeDim: + Value: '3' + Specified: false + NoDRYPattern: + Value: 'false' + Specified: false + OptimizeMigration: + Value: 'false' + Specified: false + ProcessAll: + Value: 'false' + Specified: false + RuleFile: + Value: '' + Specified: false + SyclNamedLambda: + Value: 'false' + Specified: false + UsmLevel: + Value: '1' + Specified: false +... diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Common/helper_functions.h b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Common/helper_functions.h new file mode 100644 index 0000000000..2975ddba6a --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Common/helper_functions.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// These are helper functions for the SDK samples (string parsing, +// timers, image helpers, etc) +#ifndef COMMON_HELPER_FUNCTIONS_H_ +#define COMMON_HELPER_FUNCTIONS_H_ + +#ifdef WIN32 +#pragma warning(disable : 4996) +#endif + +// includes, project +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +// includes, timer, string parsing, image helpers +#include // helper functions for image compare, dump, data comparisons +#include // helper functions for string parsing +#include // helper functions for timers + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +#endif // COMMON_HELPER_FUNCTIONS_H_ diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Common/helper_image.h b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Common/helper_image.h new file mode 100644 index 0000000000..9b7edc062c --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Common/helper_image.h @@ -0,0 +1,1001 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// These are helper functions for the SDK samples (image,bitmap) +#ifndef COMMON_HELPER_IMAGE_H_ +#define COMMON_HELPER_IMAGE_H_ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifndef MIN +#define MIN(a, b) ((a < b) ? a : b) +#endif +#ifndef MAX +#define MAX(a, b) ((a > b) ? a : b) +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +#include + +// namespace unnamed (internal) +namespace helper_image_internal { +//! 
//! Data converter from unsigned char / unsigned byte to type T.
//! Only the explicit specializations below are defined; using any other T
//! is a compile-time error because the primary template is left incomplete.
// NOTE(review): the specialization arguments were reconstructed from the
// accompanying doc comments ("from unsigned byte" / "to float") - confirm.
template <class T>
struct ConverterFromUByte;

//! Data converter from unsigned char / unsigned byte to unsigned char
template <>
struct ConverterFromUByte<unsigned char> {
  //! Conversion operator (pass-through)
  //! @return val, unchanged. Declared as unsigned char (not float) so the
  //!         byte is not round-tripped through a floating-point value.
  //! @param val value to convert
  unsigned char operator()(const unsigned char &val) const { return val; }
};

//! Data converter from unsigned char / unsigned byte to float
template <>
struct ConverterFromUByte<float> {
  //! Conversion operator
  //! @return val divided by 255, i.e. 0 -> 0.0f and 255 -> 1.0f
  //! @param val value to convert
  float operator()(const unsigned char &val) const {
    return static_cast<float>(val) / 255.0f;
  }
};
@param val value to convert + unsigned char operator()(const float &val) { + return static_cast(val * 255.0f); + } +}; +} // namespace helper_image_internal + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result != 0) +#endif +#ifndef SSCANF +#define SSCANF sscanf_s +#endif +#else +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result == NULL) +#endif +#ifndef SSCANF +#define SSCANF sscanf +#endif +#endif + +inline bool __loadPPM(const char *file, unsigned char **data, unsigned int *w, + unsigned int *h, unsigned int *channels) { + FILE *fp = NULL; + + if (FOPEN_FAIL(FOPEN(fp, file, "rb"))) { + std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl; + return false; + } + + // check header + char header[helper_image_internal::PGMHeaderSize]; + + if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) { + std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl; + return false; + } + + if (strncmp(header, "P5", 2) == 0) { + *channels = 1; + } else if (strncmp(header, "P6", 2) == 0) { + *channels = 3; + } else { + std::cerr << "__LoadPPM() : File is not a PPM or PGM image" << std::endl; + *channels = 0; + return false; + } + + // parse header, read maxval, width and height + unsigned int width = 0; + unsigned int height = 0; + unsigned int maxval = 0; + unsigned int i = 0; + + while (i < 3) { + if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) { + std::cerr << "__LoadPPM() : reading PGM header returned NULL" + << std::endl; + return false; + } + + if (header[0] == '#') { + continue; + } + + if (i == 0) { + i += SSCANF(header, "%u %u %u", &width, &height, &maxval); + } else if (i == 1) { + i += SSCANF(header, "%u %u", &height, 
&maxval); + } else if (i == 2) { + i += SSCANF(header, "%u", &maxval); + } + } + + // check if given handle for the data is initialized + if (NULL != *data) { + if (*w != width || *h != height) { + std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl; + } + } else { + *data = (unsigned char *)malloc(sizeof(unsigned char) * width * height * + *channels); + *w = width; + *h = height; + } + + // read and close file + if (fread(*data, sizeof(unsigned char), width * height * *channels, fp) == + 0) { + std::cerr << "__LoadPPM() read data returned error." << std::endl; + } + + fclose(fp); + + return true; +} + +template +inline bool sdkLoadPGM(const char *file, T **data, unsigned int *w, + unsigned int *h) { + unsigned char *idata = NULL; + unsigned int channels; + + if (true != __loadPPM(file, &idata, w, h, &channels)) { + return false; + } + + unsigned int size = *w * *h * channels; + + // initialize mem if necessary + // the correct size is checked / set in loadPGMc() + if (NULL == *data) { + *data = reinterpret_cast(malloc(sizeof(T) * size)); + } + + // copy and cast data + std::transform(idata, idata + size, *data, + helper_image_internal::ConverterFromUByte()); + + free(idata); + + return true; +} + +template +inline bool sdkLoadPPM4(const char *file, T **data, unsigned int *w, + unsigned int *h) { + unsigned char *idata = 0; + unsigned int channels; + + if (__loadPPM(file, &idata, w, h, &channels)) { + // pad 4th component + int size = *w * *h; + // keep the original pointer + unsigned char *idata_orig = idata; + *data = reinterpret_cast(malloc(sizeof(T) * size * 4)); + unsigned char *ptr = *data; + + for (int i = 0; i < size; i++) { + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = 0; + } + + free(idata_orig); + return true; + } else { + free(idata); + return false; + } +} + +inline bool __savePPM(const char *file, unsigned char *data, unsigned int w, + unsigned int h, unsigned int channels) { + assert(NULL != data); + 
assert(w > 0); + assert(h > 0); + + std::fstream fh(file, std::fstream::out | std::fstream::binary); + + if (fh.bad()) { + std::cerr << "__savePPM() : Opening file failed." << std::endl; + return false; + } + + if (channels == 1) { + fh << "P5\n"; + } else if (channels == 3) { + fh << "P6\n"; + } else { + std::cerr << "__savePPM() : Invalid number of channels." << std::endl; + return false; + } + + fh << w << "\n" << h << "\n" << 0xff << std::endl; + + for (unsigned int i = 0; (i < (w * h * channels)) && fh.good(); ++i) { + fh << data[i]; + } + + fh.flush(); + + if (fh.bad()) { + std::cerr << "__savePPM() : Writing data failed." << std::endl; + return false; + } + + fh.close(); + + return true; +} + +template +inline bool sdkSavePGM(const char *file, T *data, unsigned int w, + unsigned int h) { + unsigned int size = w * h; + unsigned char *idata = (unsigned char *)malloc(sizeof(unsigned char) * size); + + std::transform(data, data + size, idata, + helper_image_internal::ConverterToUByte()); + + // write file + bool result = __savePPM(file, idata, w, h, 1); + + // cleanup + free(idata); + + return result; +} + +inline bool sdkSavePPM4ub(const char *file, unsigned char *data, unsigned int w, + unsigned int h) { + // strip 4th component + int size = w * h; + unsigned char *ndata = + (unsigned char *)malloc(sizeof(unsigned char) * size * 3); + unsigned char *ptr = ndata; + + for (int i = 0; i < size; i++) { + *ptr++ = *data++; + *ptr++ = *data++; + *ptr++ = *data++; + data++; + } + + bool result = __savePPM(file, ndata, w, h, 3); + free(ndata); + return result; +} + +////////////////////////////////////////////////////////////////////////////// +//! Read file \filename and return the data +//! @return bool if reading the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! 
//////////////////////////////////////////////////////////////////////////////
//! Read whitespace-separated values of type T from the text file \filename
//! @return true if the whole file was parsed and copied to *data, false if
//!         the file cannot be opened, contains a token that does not parse
//!         as T, allocation fails, or a caller-supplied buffer has a
//!         different length
//! @param filename name of the source file
//! @param data     if *data is NULL, a buffer is malloc'ed and returned here;
//!                 otherwise it must hold exactly *len elements
//! @param len      number of elements read (output), or the expected number
//!                 of elements when *data is already initialized
//! @param verbose  unused, kept for interface compatibility
//!
//! Values are extracted with operator>> so every T is read with its own
//! format (the former fscanf("%f") was only valid for T = float), and the
//! last value is kept even when the file has no trailing whitespace.
//////////////////////////////////////////////////////////////////////////////
template <class T>
inline bool sdkReadFile(const char *filename, T **data, unsigned int *len,
                        bool verbose) {
  // check input arguments
  assert(NULL != filename);
  assert(NULL != data);
  assert(NULL != len);
  (void)verbose;

  // open file for reading
  std::ifstream fh(filename);

  // check if filestream is valid
  if (!fh.is_open()) {
    printf("Unable to open input file: %s\n", filename);
    return false;
  }

  // intermediate storage for the data read
  std::vector<T> data_read;

  // read all data elements; stops at end of file or at the first bad token
  for (T token; fh >> token;) {
    data_read.push_back(token);
  }

  // a stop before end-of-file means a token could not be parsed as T
  if (!fh.eof()) {
    std::cerr << "sdkReadFile() : Unable to parse data in " << filename
              << std::endl;
    return false;
  }

  const size_t count = data_read.size();

  // check if the given handle is already initialized
  if (NULL != *data) {
    if (*len != count) {
      std::cerr << "sdkReadFile() : Initialized memory given but "
                << "size mismatch with signal read "
                << "(data read / data init = " << (unsigned int)data_read.size()
                << " / " << *len << ")" << std::endl;

      return false;
    }
  } else {
    // allocate storage for the data read
    *data = reinterpret_cast<T *>(malloc(sizeof(T) * count));

    if (count > 0 && NULL == *data) {
      std::cerr << "sdkReadFile() : Failed to allocate memory." << std::endl;
      return false;
    }

    // store signal size
    *len = static_cast<unsigned int>(count);
  }

  // copy data (front() is only valid on a non-empty vector)
  if (count > 0) {
    memcpy(*data, &data_read.front(), sizeof(T) * count);
  }

  return true;
}
//////////////////////////////////////////////////////////////////////////////
//! Read one fixed-size block of the binary file \filename
//! @return true if the block buffer was allocated and the read was issued,
//!         false if the file cannot be opened, allocation fails or seeking
//!         to the block fails
//! @param filename   name of the source file
//! @param data       array of block pointers; data[block_num] is overwritten
//!                   with a newly malloc'ed buffer of block_size bytes
//! @param len        number of complete T elements actually read
//! @param block_num  index of the block; the read starts at byte offset
//!                   block_num * block_size
//! @param block_size size of one block in bytes
//! @param verbose    print a message when the file cannot be opened
//////////////////////////////////////////////////////////////////////////////
template <class T>
inline bool sdkReadFileBlocks(const char *filename, T **data, unsigned int *len,
                              unsigned int block_num, unsigned int block_size,
                              bool verbose) {
  // check input arguments
  assert(NULL != filename);
  assert(NULL != len);

  // open file for reading
  FILE *fh = fopen(filename, "rb");

  // A failed open must abort regardless of verbosity; `verbose` only
  // controls whether the failure is reported.
  if (fh == NULL) {
    if (verbose) {
      std::cerr << "sdkReadFile() : Opening file failed." << std::endl;
    }

    return false;
  }

  // allocate storage for the data read
  data[block_num] = reinterpret_cast<T *>(malloc(block_size));

  if (data[block_num] == NULL) {
    fclose(fh);

    // malloc(0) may legitimately return NULL: an empty block is not an error
    if (block_size == 0) {
      *len = 0;
      return true;
    }

    return false;
  }

  // position at the start of the requested block
  const long offset =
      static_cast<long>(block_num) * static_cast<long>(block_size);

  if (fseek(fh, offset, SEEK_SET) != 0) {
    free(data[block_num]);
    data[block_num] = NULL;
    fclose(fh);
    return false;
  }

  // read all data elements of the block
  *len = static_cast<unsigned int>(
      fread(data[block_num], sizeof(T), block_size / sizeof(T), fh));

  fclose(fh);

  return true;
}
//////////////////////////////////////////////////////////////////////////////
//! Compare two arrays of arbitrary type element by element
//! @return with threshold == 0: true only if every element differs by at
//!         most epsilon. Otherwise: true if the number of mismatching
//!         elements is strictly less than len * threshold
//! @param reference handle to the reference data / gold image
//! @param data      handle to the computed data
//! @param len       number of elements in reference and data
//! @param epsilon   maximum allowed absolute difference per element (>= 0)
//! @param threshold tolerated fraction of mismatching elements, 0 for none
//////////////////////////////////////////////////////////////////////////////
template <class T, class S>
inline bool compareData(const T *reference, const T *data,
                        const unsigned int len, const S epsilon,
                        const float threshold) {
  assert(epsilon >= 0);

  unsigned int error_count = 0;

  for (unsigned int i = 0; i < len; ++i) {
    // elements are compared after conversion to float
    const float diff =
        static_cast<float>(reference[i]) - static_cast<float>(data[i]);
    const bool comp = (diff <= epsilon) && (diff >= -epsilon);

    if (!comp) {
      ++error_count;
    }
  }

  if (threshold == 0.0f) {
    return error_count == 0;
  }

  if (error_count) {
    // %u: error_count is unsigned (the former %d mismatched the argument)
    printf("%4.2f(%%) of bytes mismatched (count=%u)\n",
           static_cast<float>(error_count) * 100 / static_cast<float>(len),
           error_count);
  }

  return len * threshold > error_count;
}
true : false); + } +} + +inline void sdkDumpBin(void *data, unsigned int bytes, const char *filename) { + printf("sdkDumpBin: <%s>\n", filename); + FILE *fp; + FOPEN(fp, filename, "wb"); + fwrite(data, bytes, 1, fp); + fflush(fp); + fclose(fp); +} + +inline bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, + unsigned int nelements, const float epsilon, + const float threshold, char *exec_path) { + unsigned int *src_buffer, *ref_buffer; + FILE *src_fp = NULL, *ref_fp = NULL; + + uint64_t error_count = 0; + size_t fsize = 0; + + if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) { + printf("compareBin2Bin unable to open src_file: %s\n", + src_file); + error_count++; + } + + char *ref_file_path = sdkFindFilePath(ref_file, exec_path); + + if (ref_file_path == NULL) { + printf("compareBin2Bin unable to find <%s> in <%s>\n", + ref_file, exec_path); + printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", + ref_file); + printf("Aborting comparison!\n"); + printf(" FAILED\n"); + error_count++; + + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } else { + if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) { + printf( + "compareBin2Bin " + " unable to open ref_file: %s\n", + ref_file_path); + error_count++; + } + + if (src_fp && ref_fp) { + src_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int)); + ref_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int)); + + fsize = fread(src_buffer, nelements, sizeof(unsigned int), src_fp); + fsize = fread(ref_buffer, nelements, sizeof(unsigned int), ref_fp); + + printf( + "> compareBin2Bin nelements=%d," + " epsilon=%4.2f, threshold=%4.2f\n", + nelements, epsilon, threshold); + printf(" src_file <%s>, size=%d bytes\n", src_file, + static_cast(fsize)); + printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, + static_cast(fsize)); + + if (!compareData(ref_buffer, src_buffer, nelements, + epsilon, threshold)) { + error_count++; + } + + fclose(src_fp); 
+ fclose(ref_fp); + + free(src_buffer); + free(ref_buffer); + } else { + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } + } + + if (error_count == 0) { + printf(" OK\n"); + } else { + printf(" FAILURE: %d errors...\n", (unsigned int)error_count); + } + + return (error_count == 0); // returns true if all pixels pass +} + +inline bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, + unsigned int nelements, const float epsilon, + const float threshold, char *exec_path) { + float *src_buffer = NULL, *ref_buffer = NULL; + FILE *src_fp = NULL, *ref_fp = NULL; + size_t fsize = 0; + + uint64_t error_count = 0; + + if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) { + printf("compareBin2Bin unable to open src_file: %s\n", src_file); + error_count = 1; + } + + char *ref_file_path = sdkFindFilePath(ref_file, exec_path); + + if (ref_file_path == NULL) { + printf("compareBin2Bin unable to find <%s> in <%s>\n", ref_file, + exec_path); + printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", + exec_path); + printf("Aborting comparison!\n"); + printf(" FAILED\n"); + error_count++; + + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } else { + if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) { + printf("compareBin2Bin unable to open ref_file: %s\n", + ref_file_path); + error_count = 1; + } + + if (src_fp && ref_fp) { + src_buffer = reinterpret_cast(malloc(nelements * sizeof(float))); + ref_buffer = reinterpret_cast(malloc(nelements * sizeof(float))); + + printf( + "> compareBin2Bin nelements=%d, epsilon=%4.2f," + " threshold=%4.2f\n", + nelements, epsilon, threshold); + fsize = fread(src_buffer, sizeof(float), nelements, src_fp); + printf(" src_file <%s>, size=%d bytes\n", src_file, + static_cast(fsize * sizeof(float))); + fsize = fread(ref_buffer, sizeof(float), nelements, ref_fp); + printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, + static_cast(fsize * sizeof(float))); + + if 
//////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays using the relative L2 norm of their difference
//! @return true if ||reference - data|| / ||reference|| is strictly less
//!         than epsilon; false otherwise, and also false when the squared
//!         norm of the reference is below 1e-7
//! @param reference handle to the reference data / gold image
//! @param data      handle to the computed data
//! @param len       number of elements in reference and data
//! @param epsilon   maximum allowed relative error (must be >= 0)
//////////////////////////////////////////////////////////////////////////////
inline bool sdkCompareL2fe(const float *reference, const float *data,
                           const unsigned int len, const float epsilon) {
  assert(epsilon >= 0);

  // accumulate the squared difference and the squared reference magnitude
  float sumSqDiff = 0;
  float sumSqRef = 0;
  const float *refIt = reference;
  const float *outIt = data;

  for (const float *refEnd = reference + len; refIt != refEnd;
       ++refIt, ++outIt) {
    const float delta = *refIt - *outIt;
    sumSqDiff += delta * delta;
    sumSqRef += *refIt * *refIt;
  }

  // a (near) zero reference cannot be used as the denominator
  if (fabs(sumSqRef) < 1e-7) {
#ifdef _DEBUG
    std::cerr << "ERROR, reference l2-norm is 0\n";
#endif
    return false;
  }

  const float relError = sqrtf(sumSqDiff) / sqrtf(sumSqRef);
  const bool withinTolerance = relError < epsilon;

#ifdef _DEBUG

  if (!withinTolerance) {
    std::cerr << "ERROR, l2-norm error " << relError
              << " is greater than epsilon " << epsilon << "\n";
  }

#endif

  return withinTolerance;
}
true; + } else { + free(idata); + return false; + } +} + +inline bool sdkComparePPM(const char *src_file, const char *ref_file, + const float epsilon, const float threshold, + bool verboseErrors) { + unsigned char *src_data, *ref_data; + uint64_t error_count = 0; + unsigned int ref_width, ref_height; + unsigned int src_width, src_height; + + if (src_file == NULL || ref_file == NULL) { + if (verboseErrors) { + std::cerr << "PPMvsPPM: src_file or ref_file is NULL." + " Aborting comparison\n"; + } + + return false; + } + + if (verboseErrors) { + std::cerr << "> Compare (a)rendered: <" << src_file << ">\n"; + std::cerr << "> (b)reference: <" << ref_file << ">\n"; + } + + if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true) { + if (verboseErrors) { + std::cerr << "PPMvsPPM: unable to load ref image file: " << ref_file + << "\n"; + } + + return false; + } + + if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true) { + std::cerr << "PPMvsPPM: unable to load src image file: " << src_file + << "\n"; + return false; + } + + if (src_height != ref_height || src_width != ref_width) { + if (verboseErrors) { + std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width + << "," << src_height << ")vs(" << ref_width << "," << ref_height + << ")\n"; + } + } + + if (verboseErrors) { + std::cerr << "PPMvsPPM: comparing images size (" << src_width << "," + << src_height << ") epsilon(" << epsilon << "), threshold(" + << threshold * 100 << "%)\n"; + } + + if (compareData(ref_data, src_data, src_width * src_height * 4, epsilon, + threshold) == false) { + error_count = 1; + } + + if (error_count == 0) { + if (verboseErrors) { + std::cerr << " OK\n\n"; + } + } else { + if (verboseErrors) { + std::cerr << " FAILURE! " << error_count << " errors...\n\n"; + } + } + + // returns true if all pixels pass + return (error_count == 0) ? 
true : false; +} + +inline bool sdkComparePGM(const char *src_file, const char *ref_file, + const float epsilon, const float threshold, + bool verboseErrors) { + unsigned char *src_data = 0, *ref_data = 0; + uint64_t error_count = 0; + unsigned int ref_width, ref_height; + unsigned int src_width, src_height; + + if (src_file == NULL || ref_file == NULL) { + if (verboseErrors) { + std::cerr << "PGMvsPGM: src_file or ref_file is NULL." + " Aborting comparison\n"; + } + + return false; + } + + if (verboseErrors) { + std::cerr << "> Compare (a)rendered: <" << src_file << ">\n"; + std::cerr << "> (b)reference: <" << ref_file << ">\n"; + } + + if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true) { + if (verboseErrors) { + std::cerr << "PGMvsPGM: unable to load ref image file: " << ref_file + << "\n"; + } + + return false; + } + + if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true) { + std::cerr << "PGMvsPGM: unable to load src image file: " << src_file + << "\n"; + return false; + } + + if (src_height != ref_height || src_width != ref_width) { + if (verboseErrors) { + std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width + << "," << src_height << ")vs(" << ref_width << "," << ref_height + << ")\n"; + } + } + + if (verboseErrors) + std::cerr << "PGMvsPGM: comparing images size (" << src_width << "," + << src_height << ") epsilon(" << epsilon << "), threshold(" + << threshold * 100 << "%)\n"; + + if (compareData(ref_data, src_data, src_width * src_height, epsilon, + threshold) == false) { + error_count = 1; + } + + if (error_count == 0) { + if (verboseErrors) { + std::cerr << " OK\n\n"; + } + } else { + if (verboseErrors) { + std::cerr << " FAILURE! " << error_count << " errors...\n\n"; + } + } + + // returns true if all pixels pass + return (error_count == 0) ? 
true : false; +} + +#endif // COMMON_HELPER_IMAGE_H_ diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Common/helper_string.h b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Common/helper_string.h new file mode 100644 index 0000000000..39a1b38058 --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Common/helper_string.h @@ -0,0 +1,428 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// These are helper functions for the SDK samples (string parsing, timers, etc) +#ifndef COMMON_HELPER_STRING_H_ +#define COMMON_HELPER_STRING_H_ + +#include +#include +#include +#include + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#ifndef _CRT_SECURE_NO_DEPRECATE +#define _CRT_SECURE_NO_DEPRECATE +#endif +#ifndef STRCASECMP +#define STRCASECMP _stricmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP _strnicmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result != 0) +#endif +#ifndef SSCANF +#define SSCANF sscanf_s +#endif +#ifndef SPRINTF +#define SPRINTF sprintf_s +#endif +#else // Linux Includes +#include +#include + +#ifndef STRCASECMP +#define STRCASECMP strcasecmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP strncasecmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result == NULL) +#endif +#ifndef SSCANF +#define SSCANF sscanf +#endif +#ifndef SPRINTF +#define SPRINTF sprintf +#endif +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// CUDA Utility Helper 
//////////////////////////////////////////////////////////////////////////////
//! Locate the extension of \filename, i.e. the text after the last '.'
//! @return index of the first character after the last '.', or 0 if there
//!         is no extension
//! @param filename  NUL-terminated file name
//! @param extension set to point into filename just after the last '.', or
//!                  to NULL if there is no '.' or the only '.' is the very
//!                  first character (e.g. ".hidden")
//!
//! A trailing dot ("name.") yields an empty, non-NULL extension.
//////////////////////////////////////////////////////////////////////////////
inline int getFileExtension(char *filename, char **extension) {
  // strrchr never reads before the start of the string, unlike the former
  // backwards index loop (which touched filename[-1] for "" and mistook a
  // dot at index 1, as in "a.b", for "no extension").
  char *dot = strrchr(filename, '.');

  if (dot == NULL || dot == filename) {
    *extension = NULL;
    return 0;
  }

  const int ext_start = static_cast<int>(dot - filename) + 1;
  *extension = &filename[ext_start];

  return ext_start;
}
1 : 0; + *value = (T)atoi(&string_argv[length + auto_inc]); + } + + bFound = true; + i = argc; + } + } + } + + return bFound; +} + +inline int getCmdLineArgumentInt(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + int value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + value = atoi(&string_argv[length + auto_inc]); + } else { + value = 0; + } + + bFound = true; + continue; + } + } + } + + if (bFound) { + return value; + } else { + return 0; + } +} + +inline float getCmdLineArgumentFloat(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + float value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 
1 : 0; + value = static_cast(atof(&string_argv[length + auto_inc])); + } else { + value = 0.f; + } + + bFound = true; + continue; + } + } + } + + if (bFound) { + return value; + } else { + return 0; + } +} + +inline bool getCmdLineArgumentString(const int argc, const char **argv, + const char *string_ref, + char **string_retval) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + char *string_argv = const_cast(&argv[i][string_start]); + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + *string_retval = &string_argv[length + 1]; + bFound = true; + continue; + } + } + } + + if (!bFound) { + *string_retval = NULL; + } + + return bFound; +} + +////////////////////////////////////////////////////////////////////////////// +//! Find the path for a file assuming that +//! files are found in the searchPath. +//! +//! @return the path if succeeded, otherwise 0 +//! @param filename name of the file +//! @param executable_path optional absolute path of the executable +////////////////////////////////////////////////////////////////////////////// +inline char *sdkFindFilePath(const char *filename, + const char *executable_path) { + // defines a variable that is replaced with the name of the + // executable + + // Typical relative search paths to locate needed companion files (e.g. 
sample + // input data, or JIT source files) The origin for the relative search may be + // the .exe file, a .bat file launching an .exe, a browser .exe launching the + // .exe or .bat, etc + const char *searchPath[] = { + "./", // same dir + "./data/", // same dir + + "../../../../Samples//", // up 4 in tree + "../../../Samples//", // up 3 in tree + "../../Samples//", // up 2 in tree + + "../../../../Samples//data/", // up 4 in tree + "../../../Samples//data/", // up 3 in tree + "../../Samples//data/", // up 2 in tree + + "../../../../Samples/0_Introduction//", // up 4 in tree + "../../../Samples/0_Introduction//", // up 3 in tree + "../../Samples/0_Introduction//", // up 2 in tree + + "../../../../Samples/1_Utilities//", // up 4 in tree + "../../../Samples/1_Utilities//", // up 3 in tree + "../../Samples/1_Utilities//", // up 2 in tree + + "../../../../Samples/2_Concepts_and_Techniques//", // up 4 in tree + "../../../Samples/2_Concepts_and_Techniques//", // up 3 in tree + "../../Samples/2_Concepts_and_Techniques//", // up 2 in tree + + "../../../../Samples/3_CUDA_Features//", // up 4 in tree + "../../../Samples/3_CUDA_Features//", // up 3 in tree + "../../Samples/3_CUDA_Features//", // up 2 in tree + + "../../../../Samples/4_CUDA_Libraries//", // up 4 in tree + "../../../Samples/4_CUDA_Libraries//", // up 3 in tree + "../../Samples/4_CUDA_Libraries//", // up 2 in tree + + "../../../../Samples/5_Domain_Specific//", // up 4 in tree + "../../../Samples/5_Domain_Specific//", // up 3 in tree + "../../Samples/5_Domain_Specific//", // up 2 in tree + + "../../../../Samples/6_Performance//", // up 4 in tree + "../../../Samples/6_Performance//", // up 3 in tree + "../../Samples/6_Performance//", // up 2 in tree + + "../../../../Samples/0_Introduction//data/", // up 4 in tree + "../../../Samples/0_Introduction//data/", // up 3 in tree + "../../Samples/0_Introduction//data/", // up 2 in tree + + "../../../../Samples/1_Utilities//data/", // up 4 in tree + 
"../../../Samples/1_Utilities//data/", // up 3 in tree + "../../Samples/1_Utilities//data/", // up 2 in tree + + "../../../../Samples/2_Concepts_and_Techniques//data/", // up 4 in tree + "../../../Samples/2_Concepts_and_Techniques//data/", // up 3 in tree + "../../Samples/2_Concepts_and_Techniques//data/", // up 2 in tree + + "../../../../Samples/3_CUDA_Features//data/", // up 4 in tree + "../../../Samples/3_CUDA_Features//data/", // up 3 in tree + "../../Samples/3_CUDA_Features//data/", // up 2 in tree + + "../../../../Samples/4_CUDA_Libraries//data/", // up 4 in tree + "../../../Samples/4_CUDA_Libraries//data/", // up 3 in tree + "../../Samples/4_CUDA_Libraries//data/", // up 2 in tree + + "../../../../Samples/5_Domain_Specific//data/", // up 4 in tree + "../../../Samples/5_Domain_Specific//data/", // up 3 in tree + "../../Samples/5_Domain_Specific//data/", // up 2 in tree + + "../../../../Samples/6_Performance//data/", // up 4 in tree + "../../../Samples/6_Performance//data/", // up 3 in tree + "../../Samples/6_Performance//data/", // up 2 in tree + + "../../../../Common/data/", // up 4 in tree + "../../../Common/data/", // up 3 in tree + "../../Common/data/" // up 2 in tree + }; + + // Extract the executable name + std::string executable_name; + + if (executable_path != 0) { + executable_name = std::string(executable_path); + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + // Windows path delimiter + size_t delimiter_pos = executable_name.find_last_of('\\'); + executable_name.erase(0, delimiter_pos + 1); + + if (executable_name.rfind(".exe") != std::string::npos) { + // we strip .exe, only if the .exe is found + executable_name.resize(executable_name.size() - 4); + } + +#else + // Linux & OSX path delimiter + size_t delimiter_pos = executable_name.find_last_of('/'); + executable_name.erase(0, delimiter_pos + 1); +#endif + } + + // Loop over all search paths and return the first hit + for (unsigned int i = 0; i < sizeof(searchPath) 
/ sizeof(char *); ++i) { + std::string path(searchPath[i]); + size_t executable_name_pos = path.find(""); + + // If there is executable_name variable in the searchPath + // replace it with the value + if (executable_name_pos != std::string::npos) { + if (executable_path != 0) { + path.replace(executable_name_pos, strlen(""), + executable_name); + } else { + // Skip this path entry if no executable argument is given + continue; + } + } + +#ifdef _DEBUG + printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str()); +#endif + + // Test if the file exists + path.append(filename); + FILE *fp; + FOPEN(fp, path.c_str(), "rb"); + + if (fp != NULL) { + fclose(fp); + // File found + // returning an allocated array here for backwards compatibility reasons + char *file_path = reinterpret_cast(malloc(path.length() + 1)); + STRCPY(file_path, path.length() + 1, path.c_str()); + return file_path; + } + + if (fp) { + fclose(fp); + } + } + + // File not found + printf("\nerror: sdkFindFilePath: file <%s> not found!\n", filename); + return 0; +} + +#endif // COMMON_HELPER_STRING_H_ diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Common/helper_timer.h b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Common/helper_timer.h new file mode 100644 index 0000000000..2fe3207ed9 --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Common/helper_timer.h @@ -0,0 +1,465 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// Helper Timing Functions +#ifndef COMMON_HELPER_TIMER_H_ +#define COMMON_HELPER_TIMER_H_ + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// includes, system +#include + +// includes, project +#include + +// Definition of the StopWatch Interface, this is used if we don't want to use +// the CUT functions But rather in a self contained class interface +class StopWatchInterface { + public: + StopWatchInterface() {} + virtual ~StopWatchInterface() {} + + public: + //! Start time measurement + virtual void start() = 0; + + //! Stop time measurement + virtual void stop() = 0; + + //! Reset time counters to zero + virtual void reset() = 0; + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! 
was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + virtual float getTime() = 0; + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + virtual float getAverageTime() = 0; +}; + +////////////////////////////////////////////////////////////////// +// Begin Stopwatch timer class definitions for all OS platforms // +////////////////////////////////////////////////////////////////// +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +// includes, system +#define WINDOWS_LEAN_AND_MEAN +#include +#undef min +#undef max + +//! Windows specific implementation of StopWatch +class StopWatchWin : public StopWatchInterface { + public: + //! Constructor, default + StopWatchWin() + : start_time(), + end_time(), + diff_time(0.0f), + total_time(0.0f), + running(false), + clock_sessions(0), + freq(0), + freq_set(false) { + if (!freq_set) { + // helper variable + LARGE_INTEGER temp; + + // get the tick frequency from the OS + QueryPerformanceFrequency(reinterpret_cast(&temp)); + + // convert to type in which it is needed + freq = (static_cast(temp.QuadPart)) / 1000.0; + + // rememeber query + freq_set = true; + } + } + + // Destructor + ~StopWatchWin() {} + + public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + + private: + // member variables + + //! 
Start of measurement + LARGE_INTEGER start_time; + //! End of measurement + LARGE_INTEGER end_time; + + //! Time difference between the last start and stop + float diff_time; + + //! TOTAL time difference between starts and stops + float total_time; + + //! flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; + + //! tick frequency + double freq; + + //! flag if the frequency has been set + bool freq_set; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchWin::start() { + QueryPerformanceCounter(reinterpret_cast(&start_time)); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop time measurement and increment add to the current diff_time summation +//! variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchWin::stop() { + QueryPerformanceCounter(reinterpret_cast(&end_time)); + diff_time = static_cast(((static_cast(end_time.QuadPart) - + static_cast(start_time.QuadPart)) / + freq)); + + total_time += diff_time; + clock_sessions++; + running = false; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. 
+//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchWin::reset() { + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) { + QueryPerformanceCounter(reinterpret_cast(&start_time)); + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. If the stop watch is still running (i.e. there +//! was no call to stop()) then the elapsed time is returned added to the +//! current diff_time sum, otherwise the current summed time difference alone +//! is returned. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchWin::getTime() { + // Return the TOTAL time to date + float retval = total_time; + + if (running) { + LARGE_INTEGER temp; + QueryPerformanceCounter(reinterpret_cast(&temp)); + retval += static_cast(((static_cast(temp.QuadPart) - + static_cast(start_time.QuadPart)) / + freq)); + } + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. for a single run based on the total number of COMPLETED runs +//! and the total time. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchWin::getAverageTime() { + return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f; +} +#else +// Declarations for Stopwatch on Linux and Mac OSX +// includes, system +#include +#include + +//! Windows specific implementation of StopWatch +class StopWatchLinux : public StopWatchInterface { + public: + //! Constructor, default + StopWatchLinux() + : start_time(), + diff_time(0.0), + total_time(0.0), + running(false), + clock_sessions(0) {} + + // Destructor + virtual ~StopWatchLinux() {} + + public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. 
after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + + private: + // helper functions + + //! Get difference between start time and current time + inline float getDiffTime(); + + private: + // member variables + + //! Start of measurement + struct timeval start_time; + + //! Time difference between the last start and stop + float diff_time; + + //! TOTAL time difference between starts and stops + float total_time; + + //! flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchLinux::start() { + gettimeofday(&start_time, 0); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop time measurement and increment add to the current diff_time summation +//! variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchLinux::stop() { + diff_time = getDiffTime(); + total_time += diff_time; + running = false; + clock_sessions++; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. 
+//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchLinux::reset() { + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) { + gettimeofday(&start_time, 0); + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. If the stop watch is still running (i.e. there +//! was no call to stop()) then the elapsed time is returned added to the +//! current diff_time sum, otherwise the current summed time difference alone +//! is returned. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchLinux::getTime() { + // Return the TOTAL time to date + float retval = total_time; + + if (running) { + retval += getDiffTime(); + } + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. for a single run based on the total number of COMPLETED runs +//! and the total time. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchLinux::getAverageTime() { + return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f; +} +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchLinux::getDiffTime() { + struct timeval t_time; + gettimeofday(&t_time, 0); + + // time difference in milli-seconds + return static_cast(1000.0 * (t_time.tv_sec - start_time.tv_sec) + + (0.001 * (t_time.tv_usec - start_time.tv_usec))); +} +#endif // WIN32 + +//////////////////////////////////////////////////////////////////////////////// +//! Timer functionality exported + +//////////////////////////////////////////////////////////////////////////////// +//! Create a new timer +//! @return true if a time has been created, otherwise false +//! 
@param name of the new timer, 0 if the creation failed +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkCreateTimer(StopWatchInterface **timer_interface) { +// printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface); +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + *timer_interface = reinterpret_cast(new StopWatchWin()); +#else + *timer_interface = + reinterpret_cast(new StopWatchLinux()); +#endif + return (*timer_interface != NULL) ? true : false; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Delete a timer +//! @return true if a time has been deleted, otherwise false +//! @param name of the timer to delete +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkDeleteTimer(StopWatchInterface **timer_interface) { + // printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + delete *timer_interface; + *timer_interface = NULL; + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Start the time with name \a name +//! @param name name of the timer to start +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkStartTimer(StopWatchInterface **timer_interface) { + // printf("sdkStartTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + (*timer_interface)->start(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop the time with name \a name. Does not reset. +//! 
@param name name of the timer to stop +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkStopTimer(StopWatchInterface **timer_interface) { + // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + (*timer_interface)->stop(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Resets the timer's counter. +//! @param name name of the timer to reset. +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkResetTimer(StopWatchInterface **timer_interface) { + // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + (*timer_interface)->reset(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Return the average time for timer execution as the total time +//! for the timer dividied by the number of completed (stopped) runs the timer +//! has made. +//! Excludes the current running time if the timer is currently running. +//! @param name name of the timer to return the time of +//////////////////////////////////////////////////////////////////////////////// +inline float sdkGetAverageTimerValue(StopWatchInterface **timer_interface) { + // printf("sdkGetAverageTimerValue called object %08x\n", (void + // *)*timer_interface); + if (*timer_interface) { + return (*timer_interface)->getAverageTime(); + } else { + return 0.0f; + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Total execution time for the timer over all runs since the last reset +//! or timer creation. +//! @param name name of the timer to obtain the value of. 
+//////////////////////////////////////////////////////////////////////////////// +inline float sdkGetTimerValue(StopWatchInterface **timer_interface) { + // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + return (*timer_interface)->getTime(); + } else { + return 0.0f; + } +} + +#endif // COMMON_HELPER_TIMER_H_ diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/MainSourceFiles.yaml b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/MainSourceFiles.yaml new file mode 100644 index 0000000000..ca1b0962aa --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/MainSourceFiles.yaml @@ -0,0 +1,1421 @@ +--- +MainSourceFile: MainSrcFiles_placehold +Replacements: + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 1563 + Length: 0 + ReplacementText: "#include \n#include \n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 1641 + Length: 34 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 1910 + Length: 43 + ReplacementText: 'dpct::constant_memory c_Kernel(KERNEL_LENGTH);' + ConstantFlag: DeviceConstant + ConstantOffset: 1910 + InitStr: '' + NewHostVarName: c_Kernel_host_ct1 + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2013 + Length: 18 + ReplacementText: 
'dpct::get_default_queue().memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2032 + Length: 8 + ReplacementText: 'c_Kernel.get_ptr()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2082 + Length: 0 + ReplacementText: '.wait()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2383 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2520 + Length: 0 + ReplacementText: ",\n sycl::nd_item<3> item_ct1, float *c_Kernel,\n sycl::local_accessor s_Data" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2560 + Length: 16 + ReplacementText: auto + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2583 + Length: 23 + ReplacementText: 'item_ct1.get_group()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false 
+ - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2610 + Length: 138 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2811 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2887 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2920 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2951 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 3159 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + 
Offset: 3172 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 3342 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 3355 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 3643 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 3656 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 3811 + Length: 0 + ReplacementText: " /*\n DPCT1065:0: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: 
'/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 3813 + Length: 13 + ReplacementText: 'item_ct1.barrier()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 4083 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 4096 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 4483 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 4495 + Length: 86 + ReplacementText: '1, imageH / ROWS_BLOCKDIM_Y, imageW / (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 4586 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: 
'/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 4599 + Length: 32 + ReplacementText: 1, ROWS_BLOCKDIM_Y, ROWS_BLOCKDIM_X + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 4635 + Length: 0 + ReplacementText: " /*\n DPCT1049:1: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 4637 + Length: 125 + ReplacementText: "dpct::get_default_queue().submit(\n [&](sycl::handler &cgh) {\n c_Kernel.init();\n\n auto c_Kernel_ptr_ct1 = c_Kernel.get_ptr();\n\n sycl::local_accessor s_Data_acc_ct1(sycl::range<2>(4/*4*/, 160/*(ROWS_RESULT_STEPS + 2 * ROWS_HALO_STEPS) *\n ROWS_BLOCKDIM_X*/), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(blocks * threads, threads), \n [=](sycl::nd_item<3> item_ct1) {\n convolutionRowsKernel(d_Dst, d_Src, imageW, imageH, imageW, item_ct1, c_Kernel_ptr_ct1, s_Data_acc_ct1);\n });\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 4762 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: 
'/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 5144 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 5287 + Length: 0 + ReplacementText: ",\n sycl::nd_item<3> item_ct1,\n float *c_Kernel,\n sycl::local_accessor s_Data" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 5327 + Length: 16 + ReplacementText: auto + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 5350 + Length: 23 + ReplacementText: 'item_ct1.get_group()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 5377 + Length: 261 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 5695 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 5729 
+ Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 5763 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 5886 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 6104 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 6117 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 6300 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 6313 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + 
NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 6659 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 6672 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 6867 + Length: 0 + ReplacementText: " /*\n DPCT1065:2: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 6869 + Length: 13 + ReplacementText: 'item_ct1.barrier()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 7154 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 7167 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + 
ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 7590 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 7602 + Length: 95 + ReplacementText: '1, imageH / (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y), imageW / COLUMNS_BLOCKDIM_X' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 7702 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 7715 + Length: 38 + ReplacementText: 1, COLUMNS_BLOCKDIM_Y, COLUMNS_BLOCKDIM_X + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 7757 + Length: 0 + ReplacementText: " /*\n DPCT1049:3: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. 
Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 7759 + Length: 131 + ReplacementText: "dpct::get_default_queue().submit(\n [&](sycl::handler &cgh) {\n c_Kernel.init();\n\n auto c_Kernel_ptr_ct1 = c_Kernel.get_ptr();\n\n sycl::local_accessor s_Data_acc_ct1(sycl::range<2>(16/*16*/, 81/*(COLUMNS_RESULT_STEPS +\n 2 * COLUMNS_HALO_STEPS) *\n COLUMNS_BLOCKDIM_Y +\n 1*/), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(blocks * threads, threads), \n [=](sycl::nd_item<3> item_ct1) {\n convolutionColumnsKernel(d_Dst, d_Src, imageW, imageH, imageW, item_ct1, c_Kernel_ptr_ct1, s_Data_acc_ct1);\n });\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 7890 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 1684 + Length: 26 + ReplacementText: "#include \n#include \n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 1840 + Length: 0 + ReplacementText: "\n#include \n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 3799 + Length: 0 + ReplacementText: " 
/*\n DPCT1003:24: Migrated API does not return error code. (*, 0) is inserted. You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 3824 + Length: 62 + ReplacementText: '(d_Input = sycl::malloc_device(imageW * imageH, dpct::get_default_queue()), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 3889 + Length: 0 + ReplacementText: " /*\n DPCT1003:25: Migrated API does not return error code. (*, 0) is inserted. You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 3914 + Length: 63 + ReplacementText: '(d_Output = sycl::malloc_device(imageW * imageH, dpct::get_default_queue()), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 3980 + Length: 0 + ReplacementText: " /*\n DPCT1003:26: Migrated API does not return error code. (*, 0) is inserted. 
You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 4005 + Length: 63 + ReplacementText: '(d_Buffer = sycl::malloc_device(imageW * imageH, dpct::get_default_queue()), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 4106 + Length: 0 + ReplacementText: " /*\n DPCT1003:27: Migrated API does not return error code. (*, 0) is inserted. You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 4124 + Length: 10 + ReplacementText: '(dpct::get_default_queue().memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 4184 + Length: 53 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 4238 + Length: 0 + ReplacementText: '.wait(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 4429 + Length: 0 + ReplacementText: " /*\n DPCT1003:28: Migrated API does not return error code. 
(*, 0) is inserted. You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 4451 + Length: 23 + ReplacementText: '(dpct::get_current_device().queues_wait_and_throw(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 4672 + Length: 0 + ReplacementText: " /*\n DPCT1003:29: Migrated API does not return error code. (*, 0) is inserted. You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 4690 + Length: 23 + ReplacementText: '(dpct::get_current_device().queues_wait_and_throw(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 5108 + Length: 0 + ReplacementText: " /*\n DPCT1003:30: Migrated API does not return error code. (*, 0) is inserted. 
You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 5126 + Length: 10 + ReplacementText: '(dpct::get_default_queue().memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 5220 + Length: 53 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 5274 + Length: 0 + ReplacementText: '.wait(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 5991 + Length: 0 + ReplacementText: " /*\n DPCT1003:31: Migrated API does not return error code. (*, 0) is inserted. 
You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 6009 + Length: 18 + ReplacementText: '(sycl::free(d_Buffer, dpct::get_default_queue())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 6027 + Length: 0 + ReplacementText: ', 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 6030 + Length: 0 + ReplacementText: " /*\n DPCT1003:32: Migrated API does not return error code. (*, 0) is inserted. You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 6048 + Length: 18 + ReplacementText: '(sycl::free(d_Output, dpct::get_default_queue())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 6066 + Length: 0 + ReplacementText: ', 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 6069 + Length: 0 + ReplacementText: " /*\n DPCT1003:33: Migrated API does not return error code. (*, 0) is inserted. 
You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 6087 + Length: 17 + ReplacementText: '(sycl::free(d_Input, dpct::get_default_queue())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 6104 + Length: 0 + ReplacementText: ', 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false +MainSourceFilesDigest: + - MainSourceFile: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Digest: a415e6d147e03637b5e283e388bc2d95 + - MainSourceFile: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Digest: 701666f627dc80bcbb7d7e3ae2f7bf55 +DpctVersion: 2023.0.0 +MainHelperFileName: dpct +USMLevel: '' +FeatureMap: + device.hpp: + dev_mgr: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dev_mgr + SubFeatureMap: + dev_mgr_1: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dev_mgr + SubFeatureMap: 
{} + dev_mgr_2: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dev_mgr + SubFeatureMap: {} + dev_mgr_3: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dev_mgr + SubFeatureMap: {} + dev_mgr_4: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dev_mgr + SubFeatureMap: {} + dev_mgr_check_id: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: 'dev_mgr::check_id' + SubFeatureMap: {} + dev_mgr_current_device: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: 'dev_mgr::current_device' + SubFeatureMap: {} + dev_mgr_current_device_id: + 
IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: 'dev_mgr::current_device_id' + SubFeatureMap: {} + dev_mgr_device_count: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'dev_mgr::device_count' + SubFeatureMap: {} + dev_mgr_get_device: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'dev_mgr::get_device' + SubFeatureMap: {} + dev_mgr_select_device: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'dev_mgr::select_device' + SubFeatureMap: {} + device_ext: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: device_ext + SubFeatureMap: + device_ext_1: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: device_ext + SubFeatureMap: {} + device_ext_2: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - 
'/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: device_ext + SubFeatureMap: {} + device_ext_default_queue: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: 'device_ext::default_queue' + SubFeatureMap: {} + device_ext_get_device_info_return_info: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_device_info' + SubFeatureMap: {} + device_ext_get_device_info_return_void: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_device_info' + SubFeatureMap: {} + device_ext_get_integrated: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_integrated' + SubFeatureMap: {} + device_ext_get_major_version: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_major_version' + SubFeatureMap: {} + device_ext_get_max_clock_frequency: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_max_clock_frequency' + SubFeatureMap: {} + device_ext_get_max_compute_units: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_max_compute_units' + SubFeatureMap: {} + device_ext_get_minor_version: + IsCalled: true + CallerSrcFiles: + - 
'/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_minor_version' + SubFeatureMap: {} + device_ext_get_version: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_version' + SubFeatureMap: {} + device_ext_queues_wait_and_throw: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: 'device_ext::queues_wait_and_throw' + SubFeatureMap: {} + device_info: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: device_info + SubFeatureMap: + device_info_1: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: device_info + SubFeatureMap: {} + device_info_get_integrated: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::get_integrated' + SubFeatureMap: {} + device_info_get_max_clock_frequency: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::get_max_clock_frequency' + SubFeatureMap: {} + device_info_get_max_compute_units: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::get_max_compute_units' + SubFeatureMap: {} + device_info_set_global_mem_size: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_global_mem_size' + SubFeatureMap: {} + device_info_set_host_unified_memory: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_host_unified_memory' + SubFeatureMap: {} + device_info_set_local_mem_size: + IsCalled: 
true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_local_mem_size' + SubFeatureMap: {} + device_info_set_major_version: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_major_version' + SubFeatureMap: {} + device_info_set_max_clock_frequency: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_max_clock_frequency' + SubFeatureMap: {} + device_info_set_max_compute_units: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_max_compute_units' + SubFeatureMap: {} + device_info_set_max_nd_range_size: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_max_nd_range_size' + SubFeatureMap: {} + device_info_set_max_sub_group_size: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_max_sub_group_size' + SubFeatureMap: {} + device_info_set_max_work_group_size: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_max_work_group_size' + SubFeatureMap: {} + device_info_set_max_work_item_sizes: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_max_work_item_sizes' + SubFeatureMap: {} + device_info_set_max_work_items_per_compute_unit: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_max_work_items_per_compute_unit' + SubFeatureMap: {} + device_info_set_minor_version: + IsCalled: true + CallerSrcFiles: + - 
'/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_minor_version' + SubFeatureMap: {} + device_info_set_name: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_name' + SubFeatureMap: {} + exception_handler: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: exception_handler + SubFeatureMap: {} + get_current_device: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: get_current_device + SubFeatureMap: {} + get_default_queue: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: get_default_queue + SubFeatureMap: {} + get_tid: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: get_tid + SubFeatureMap: {} + select_device: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: select_device + SubFeatureMap: {} + dpct.hpp: + non_local_include_dependency: + IsCalled: true + CallerSrcFiles: + - '' + FeatureName: '' + SubFeatureMap: {} + 
memory.hpp: + constant_memory_alias: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: constant_memory + SubFeatureMap: {} + device_memory: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: device_memory + SubFeatureMap: + device_memory_1: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: device_memory + SubFeatureMap: {} + device_memory_2: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: device_memory + SubFeatureMap: {} + device_memory_3: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: device_memory + SubFeatureMap: {} + device_memory_4: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: device_memory + SubFeatureMap: {} + device_memory_5: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: device_memory + SubFeatureMap: {} + device_memory_6: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: device_memory + SubFeatureMap: {} + device_memory_allocate_device: + IsCalled: true + CallerSrcFiles: + - 
'/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: 'device_memory::allocate_device' + SubFeatureMap: {} + device_memory_get_ptr: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: 'device_memory::get_ptr' + SubFeatureMap: {} + device_memory_get_ptr_q: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: 'device_memory::get_ptr' + SubFeatureMap: {} + device_memory_init: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: 'device_memory::init' + SubFeatureMap: {} + device_memory_init_q: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: 'device_memory::init' + SubFeatureMap: {} + device_memory_value_t_alias: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: 'device_memory::value_t' + SubFeatureMap: {} + dpct_free: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: dpct_free + SubFeatureMap: {} + dpct_get_copy_range_detail: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_get_copy_range_detail + SubFeatureMap: {} + dpct_get_offset_detail: + IsCalled: true + CallerSrcFiles: + - 
'/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_get_offset_detail + SubFeatureMap: {} + dpct_malloc_detail: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: dpct_malloc + SubFeatureMap: {} + dpct_memcpy: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_memcpy + SubFeatureMap: {} + dpct_memcpy_2d: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_memcpy + SubFeatureMap: {} + dpct_memcpy_2d_3d_pitch_detail: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_memcpy + SubFeatureMap: {} + dpct_memcpy_2d_pitch_detail: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_memcpy + SubFeatureMap: {} + dpct_memcpy_3d: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_memcpy + SubFeatureMap: {} + dpct_memcpy_3d_detail: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_memcpy + SubFeatureMap: {} + dpct_memcpy_detail: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + 
FeatureName: dpct_memcpy + SubFeatureMap: {} + get_memcpy_direction: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: get_memcpy_direction + SubFeatureMap: {} + get_pointer_attribute: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: get_pointer_attribute + SubFeatureMap: {} + mem_mgr: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: mem_mgr + SubFeatureMap: + mem_mgr_1: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: mem_mgr + SubFeatureMap: {} + mem_mgr_2: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: mem_mgr + SubFeatureMap: {} + memcpy_direction: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: memcpy_direction + SubFeatureMap: {} + memory_region: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: memory_region + SubFeatureMap: {} + memory_traits: + IsCalled: 
true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: memory_traits + SubFeatureMap: {} + pitched_data: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: pitched_data + SubFeatureMap: + pitched_data_get_data_ptr: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: 'pitched_data::get_data_ptr' + SubFeatureMap: {} + pitched_data_get_pitch: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: 'pitched_data::get_pitch' + SubFeatureMap: {} + pitched_data_get_y: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: 'pitched_data::get_y' + SubFeatureMap: {} + pitched_data_1: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: pitched_data + SubFeatureMap: {} + pointer_access_attribute: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: pointer_access_attribute + SubFeatureMap: {} + typedef_buffer_t: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: buffer_t + SubFeatureMap: {} + typedef_byte_t: + IsCalled: true + CallerSrcFiles: + - 
'/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: byte_t + SubFeatureMap: {} +CompileTargets: {} +OptionMap: + AnalysisScopePath: + Value: '/home/tcs/Manjula_workspace/cuda-samples' + Specified: false + AsyncHandler: + Value: 'false' + Specified: false + CommentsEnabled: + Value: 'false' + Specified: false + CompilationsDir: + Value: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable' + Specified: true + CtadEnabled: + Value: 'false' + Specified: false + CustomHelperFileName: + Value: dpct + Specified: false + ExperimentalFlag: + Value: '0' + Specified: false + ExplicitClNamespace: + Value: 'false' + Specified: false + ExplicitNamespace: + Value: '20' + Specified: false + ExtensionDDFlag: + Value: '0' + Specified: false + ExtensionDEFlag: + Value: '4294967295' + Specified: false + NDRangeDim: + Value: '3' + Specified: false + NoDRYPattern: + Value: 'false' + Specified: false + OptimizeMigration: + Value: 'false' + Specified: false + ProcessAll: + Value: 'false' + Specified: false + RuleFile: + Value: '' + Specified: false + SyclNamedLambda: + Value: 'false' + Specified: false + UsmLevel: + Value: '1' + Specified: false +... diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.dp.cpp b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.dp.cpp new file mode 100644 index 0000000000..fe62db1da6 --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.dp.cpp @@ -0,0 +1,276 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include + +#include "convolutionSeparable_common.h" + +//////////////////////////////////////////////////////////////////////////////// +// Convolution kernel storage +//////////////////////////////////////////////////////////////////////////////// +dpct::constant_memory c_Kernel(KERNEL_LENGTH); + +extern "C" void setConvolutionKernel(float *h_Kernel) { + dpct::get_default_queue() + .memcpy(c_Kernel.get_ptr(), h_Kernel, KERNEL_LENGTH * sizeof(float)) + .wait(); +} + +//////////////////////////////////////////////////////////////////////////////// +// Row convolution filter +//////////////////////////////////////////////////////////////////////////////// +#define ROWS_BLOCKDIM_X 16 +#define ROWS_BLOCKDIM_Y 4 +#define ROWS_RESULT_STEPS 8 +#define ROWS_HALO_STEPS 1 + +void convolutionRowsKernel(float *d_Dst, float *d_Src, int imageW, + int imageH, int pitch, + sycl::nd_item<3> item_ct1, float *c_Kernel, + sycl::local_accessor s_Data) { + // Handle to thread block group + auto cta = item_ct1.get_group(); + + // Offset to the left halo edge + const int baseX = + (item_ct1.get_group(2) * ROWS_RESULT_STEPS - ROWS_HALO_STEPS) * + ROWS_BLOCKDIM_X + + item_ct1.get_local_id(2); + const int baseY = + item_ct1.get_group(1) * ROWS_BLOCKDIM_Y + item_ct1.get_local_id(1); + + d_Src += baseY * pitch + baseX; + d_Dst += baseY * pitch + baseX; + +// Load main data +#pragma unroll + + for (int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++) { + s_Data[item_ct1.get_local_id(1)] + [item_ct1.get_local_id(2) + i * ROWS_BLOCKDIM_X] = + d_Src[i * ROWS_BLOCKDIM_X]; + } + +// Load left halo +#pragma unroll + + for (int i = 0; i < ROWS_HALO_STEPS; i++) { + s_Data[item_ct1.get_local_id(1)] + [item_ct1.get_local_id(2) + i * ROWS_BLOCKDIM_X] = + (baseX >= -i * ROWS_BLOCKDIM_X) ? 
d_Src[i * ROWS_BLOCKDIM_X] : 0; + } + +// Load right halo +#pragma unroll + + for (int i = ROWS_HALO_STEPS + ROWS_RESULT_STEPS; + i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS + ROWS_HALO_STEPS; i++) { + s_Data[item_ct1.get_local_id(1)][item_ct1.get_local_id(2) + + i * ROWS_BLOCKDIM_X] = + (imageW - baseX > i * ROWS_BLOCKDIM_X) ? d_Src[i * ROWS_BLOCKDIM_X] : 0; + } + + // Compute and store results + /* + DPCT1065:0: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + item_ct1.barrier(); +#pragma unroll + + for (int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++) { + float sum =0; + +#pragma unroll + + for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) { + sum += c_Kernel[KERNEL_RADIUS - j] * + s_Data[item_ct1.get_local_id(1)] + [item_ct1.get_local_id(2) + i * ROWS_BLOCKDIM_X + j]; + } + d_Dst[i * ROWS_BLOCKDIM_X] = sum; + } +} + +extern "C" void convolutionRowsGPU(float *d_Dst, float *d_Src, int imageW, + int imageH) { + assert(ROWS_BLOCKDIM_X * ROWS_HALO_STEPS >= KERNEL_RADIUS); + assert(imageW % (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X) == 0); + assert(imageH % ROWS_BLOCKDIM_Y == 0); + + sycl::range<3> blocks(1, imageH / ROWS_BLOCKDIM_Y, + imageW / (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X)); + sycl::range<3> threads(1, ROWS_BLOCKDIM_Y, ROWS_BLOCKDIM_X); + + /* + DPCT1049:1: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. 
+ */ + dpct::get_default_queue().submit([&](sycl::handler &cgh) { + c_Kernel.init(); + + auto c_Kernel_ptr_ct1 = c_Kernel.get_ptr(); + + sycl::local_accessor s_Data_acc_ct1(sycl::range<2>( + 4 /*4*/, + 160 /*(ROWS_RESULT_STEPS + + 2 * ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X*/), + cgh); + + cgh.parallel_for(sycl::nd_range<3>(blocks * threads, threads), + [=](sycl::nd_item<3> item_ct1) { + convolutionRowsKernel(d_Dst, d_Src, imageW, imageH, + imageW, item_ct1, c_Kernel_ptr_ct1, + s_Data_acc_ct1); + }); + }); + getLastCudaError("convolutionRowsKernel() execution failed\n"); +} + +//////////////////////////////////////////////////////////////////////////////// +// Column convolution filter +//////////////////////////////////////////////////////////////////////////////// +#define COLUMNS_BLOCKDIM_X 16 +#define COLUMNS_BLOCKDIM_Y 8 +#define COLUMNS_RESULT_STEPS 8 +#define COLUMNS_HALO_STEPS 1 + +void convolutionColumnsKernel(float *d_Dst, float *d_Src, int imageW, + int imageH, int pitch, + sycl::nd_item<3> item_ct1, + float *c_Kernel, + sycl::local_accessor s_Data) { + // Handle to thread block group + auto cta = item_ct1.get_group(); + + // Offset to the upper halo edge + const int baseX = + item_ct1.get_group(2) * COLUMNS_BLOCKDIM_X + item_ct1.get_local_id(2); + const int baseY = + (item_ct1.get_group(1) * COLUMNS_RESULT_STEPS - COLUMNS_HALO_STEPS) * + COLUMNS_BLOCKDIM_Y + + item_ct1.get_local_id(1); + d_Src += baseY * pitch + baseX; + d_Dst += baseY * pitch + baseX; + +// Main data +#pragma unroll + + for (int i = COLUMNS_HALO_STEPS; + i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++) { + s_Data[item_ct1.get_local_id(2)] + [item_ct1.get_local_id(1) + i * COLUMNS_BLOCKDIM_Y] = + d_Src[i * COLUMNS_BLOCKDIM_Y * pitch]; + } + +// Upper halo +#pragma unroll + + for (int i = 0; i < COLUMNS_HALO_STEPS; i++) { + s_Data[item_ct1.get_local_id(2)] + [item_ct1.get_local_id(1) + i * COLUMNS_BLOCKDIM_Y] = + (baseY >= -i * COLUMNS_BLOCKDIM_Y) + ? 
d_Src[i * COLUMNS_BLOCKDIM_Y * pitch] + : 0; + } + +// Lower halo +#pragma unroll + + for (int i = COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; + i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS + COLUMNS_HALO_STEPS; + i++) { + s_Data[item_ct1.get_local_id(2)] + [item_ct1.get_local_id(1) + i * COLUMNS_BLOCKDIM_Y] = + (imageH - baseY > i * COLUMNS_BLOCKDIM_Y) + ? d_Src[i * COLUMNS_BLOCKDIM_Y * pitch] + : 0; + } + + // Compute and store results + /* + DPCT1065:2: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + item_ct1.barrier(); +#pragma unroll + + for (int i = COLUMNS_HALO_STEPS; + i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++) { + float sum =0; +#pragma unroll + + for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) { + sum += c_Kernel[KERNEL_RADIUS - j] * + s_Data[item_ct1.get_local_id(2)] + [item_ct1.get_local_id(1) + i * COLUMNS_BLOCKDIM_Y + j]; + } + d_Dst[i * COLUMNS_BLOCKDIM_Y * pitch] = sum; + } +} + +extern "C" void convolutionColumnsGPU(float *d_Dst, float *d_Src, int imageW, + int imageH) { + assert(COLUMNS_BLOCKDIM_Y * COLUMNS_HALO_STEPS >= KERNEL_RADIUS); + assert(imageW % COLUMNS_BLOCKDIM_X == 0); + assert(imageH % (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y) == 0); + + sycl::range<3> blocks(1, imageH / (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y), + imageW / COLUMNS_BLOCKDIM_X); + sycl::range<3> threads(1, COLUMNS_BLOCKDIM_Y, COLUMNS_BLOCKDIM_X); + + /* + DPCT1049:3: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. 
+ */ + dpct::get_default_queue().submit([&](sycl::handler &cgh) { + c_Kernel.init(); + + auto c_Kernel_ptr_ct1 = c_Kernel.get_ptr(); + + sycl::local_accessor s_Data_acc_ct1( + sycl::range<2>(16 /*16*/, 81 /*(COLUMNS_RESULT_STEPS + + 2 * COLUMNS_HALO_STEPS) * + COLUMNS_BLOCKDIM_Y + + 1*/), cgh); + + cgh.parallel_for(sycl::nd_range<3>(blocks * threads, threads), + [=](sycl::nd_item<3> item_ct1) { + convolutionColumnsKernel( + d_Dst, d_Src, imageW, imageH, imageW, item_ct1, + c_Kernel_ptr_ct1, s_Data_acc_ct1); + }); + }); + getLastCudaError("convolutionColumnsKernel() execution failed\n"); +} diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_common.h b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_common.h new file mode 100644 index 0000000000..9dba4a5a42 --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_common.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CONVOLUTIONSEPARABLE_COMMON_H +#define CONVOLUTIONSEPARABLE_COMMON_H + +#define KERNEL_RADIUS 8 +#define KERNEL_LENGTH (2 * KERNEL_RADIUS + 1) + +//////////////////////////////////////////////////////////////////////////////// +// Reference CPU convolution +//////////////////////////////////////////////////////////////////////////////// +extern "C" void convolutionRowCPU(float *h_Dst, float *h_Src, float *h_Kernel, + int imageW, int imageH, int kernelR); + +extern "C" void convolutionColumnCPU(float *h_Dst, float *h_Src, + float *h_Kernel, int imageW, int imageH, + int kernelR); + +//////////////////////////////////////////////////////////////////////////////// +// GPU convolution +//////////////////////////////////////////////////////////////////////////////// +extern "C" void setConvolutionKernel(float *h_Kernel); + +extern "C" void convolutionRowsGPU(float *d_Dst, float *d_Src, int imageW, + int imageH); + +extern "C" void convolutionColumnsGPU(float *d_Dst, float *d_Src, int imageW, + int imageH); + +#endif diff --git 
a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_gold.cpp b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_gold.cpp new file mode 100644 index 0000000000..e8a40ca816 --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_gold.cpp @@ -0,0 +1,69 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "convolutionSeparable_common.h" + +//////////////////////////////////////////////////////////////////////////////// +// Reference row convolution filter +//////////////////////////////////////////////////////////////////////////////// +extern "C" void convolutionRowCPU(float *h_Dst, float *h_Src, float *h_Kernel, + int imageW, int imageH, int kernelR) { + for (int y = 0; y < imageH; y++) + for (int x = 0; x < imageW; x++) { + float sum = 0; + + for (int k = -kernelR; k <= kernelR; k++) { + int d = x + k; + + if (d >= 0 && d < imageW) + sum += h_Src[y * imageW + d] * h_Kernel[kernelR - k]; + } + + h_Dst[y * imageW + x] = sum; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Reference column convolution filter +//////////////////////////////////////////////////////////////////////////////// +extern "C" void convolutionColumnCPU(float *h_Dst, float *h_Src, + float *h_Kernel, int imageW, int imageH, + int kernelR) { + for (int y = 0; y < imageH; y++) + for (int x = 0; x < imageW; x++) { + float sum = 0; + + for (int k = -kernelR; k <= kernelR; k++) { + int d = y + k; + + if (d >= 0 && d < imageH) + sum += h_Src[d * imageW + x] * h_Kernel[kernelR - k]; + } + + h_Dst[y * imageW + x] = sum; + } +} diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp.dp.cpp 
b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp.dp.cpp new file mode 100644 index 0000000000..7bb43b3f8a --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp.dp.cpp @@ -0,0 +1,226 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+/*
+* This sample implements a separable convolution filter
+* of a 2D image with an arbitrary kernel.
+*/
+
+// SYCL runtime and DPCT helper layer (migrated from the CUDA runtime include)
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+
+// Utilities and system includes
+#include <helper_cuda.h>
+#include <helper_functions.h>
+
+#include "convolutionSeparable_common.h"
+#include <cmath>
+
+////////////////////////////////////////////////////////////////////////////////
+// Reference CPU convolution
+////////////////////////////////////////////////////////////////////////////////
+extern "C" void convolutionRowCPU(float *h_Result, float *h_Data,
+                                  float *h_Kernel, int imageW, int imageH,
+                                  int kernelR);
+
+extern "C" void convolutionColumnCPU(float *h_Result, float *h_Data,
+                                     float *h_Kernel, int imageW, int imageH,
+                                     int kernelR);
+
+////////////////////////////////////////////////////////////////////////////////
+// Main program
+//
+// Runs the row + column convolution passes on the device, times them over
+// `iterations` runs (after one warm-up run), then recomputes the result with
+// the CPU reference and compares both via the relative L2 norm.
+// Exits with EXIT_SUCCESS when the norm is <= 1e-6, EXIT_FAILURE otherwise
+// (including when an allocation fails or the norm is NaN).
+////////////////////////////////////////////////////////////////////////////////
+int main(int argc, char **argv) {
+  // start logs
+  printf("[%s] - Starting...\n", argv[0]);
+
+  float *h_Kernel, *h_Input, *h_Buffer, *h_OutputCPU, *h_OutputGPU;
+
+  float *d_Input, *d_Output, *d_Buffer;
+
+  const int imageW = 3072;
+  const int imageH = 3072;
+  const int iterations = 16;
+  // Element count stays a signed int (3072 * 3072 fits) so loop comparisons
+  // are like-signed; byte sizes are widened to size_t before multiplying.
+  const int pixelCount = imageW * imageH;
+  const size_t imageBytes = (size_t)pixelCount * sizeof(float);
+
+  StopWatchInterface *hTimer = NULL;
+
+  // Use command-line specified CUDA device, otherwise use device with highest
+  // Gflops/s
+//  findCudaDevice(argc, (const char **)argv);
+  std::cout << "\nRunning on "
+            << dpct::get_default_queue()
+                   .get_device()
+                   .get_info<sycl::info::device::name>()
+            << "\n";
+
+  sdkCreateTimer(&hTimer);
+
+  printf("Image Width x Height = %i x %i\n\n", imageW, imageH);
+  printf("Allocating and initializing host arrays...\n");
+  h_Kernel = (float *)malloc(KERNEL_LENGTH * sizeof(float));
+  h_Input = (float *)malloc(imageBytes);
+  h_Buffer = (float *)malloc(imageBytes);
+  h_OutputCPU = (float *)malloc(imageBytes);
+  h_OutputGPU = (float *)malloc(imageBytes);
+  // malloc reports failure only through a null return; stop before the
+  // initialisation loops dereference it.
+  if (!h_Kernel || !h_Input || !h_Buffer || !h_OutputCPU || !h_OutputGPU) {
+    fprintf(stderr, "Host memory allocation failed\n");
+    exit(EXIT_FAILURE);
+  }
+  srand(200);
+
+  for (int i = 0; i < KERNEL_LENGTH; i++) {
+    h_Kernel[i] = (float)(rand() % 16);
+  }
+
+  for (int i = 0; i < pixelCount; i++) {
+    h_Input[i] = (float)(rand() % 16);
+  }
+
+  printf("Allocating and initializing CUDA arrays...\n");
+  /*
+  DPCT1003:24: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors((d_Input = sycl::malloc_device<float>(
+                       pixelCount, dpct::get_default_queue()),
+                   0));
+  /*
+  DPCT1003:25: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors((d_Output = sycl::malloc_device<float>(
+                       pixelCount, dpct::get_default_queue()),
+                   0));
+  /*
+  DPCT1003:26: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors((d_Buffer = sycl::malloc_device<float>(
+                       pixelCount, dpct::get_default_queue()),
+                   0));
+  // The (expr, 0) wrappers above always hand 0 to checkCudaErrors, so a failed
+  // USM allocation (null pointer) has to be detected explicitly.
+  if (!d_Input || !d_Output || !d_Buffer) {
+    fprintf(stderr, "Device memory allocation failed\n");
+    exit(EXIT_FAILURE);
+  }
+
+  setConvolutionKernel(h_Kernel);
+  /*
+  DPCT1003:27: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors(
+      (dpct::get_default_queue().memcpy(d_Input, h_Input, imageBytes).wait(),
+       0));
+
+  // `iterations` is an int, so it is printed with %i (was %u).
+  printf("Running GPU convolution (%i identical iterations)...\n\n",
+         iterations);
+
+  for (int i = -1; i < iterations; i++) {
+    // i == -1 -- warmup iteration
+    if (i == 0) {
+      /*
+      DPCT1003:28: Migrated API does not return error code. (*, 0) is inserted.
+      You may need to rewrite this code.
+      */
+      checkCudaErrors((dpct::get_current_device().queues_wait_and_throw(), 0));
+      sdkResetTimer(&hTimer);
+      sdkStartTimer(&hTimer);
+    }
+
+    convolutionRowsGPU(d_Buffer, d_Input, imageW, imageH);
+
+    convolutionColumnsGPU(d_Output, d_Buffer, imageW, imageH);
+  }
+
+  /*
+  DPCT1003:29: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors((dpct::get_current_device().queues_wait_and_throw(), 0));
+  sdkStopTimer(&hTimer);
+  double gpuTime = 0.001 * sdkGetTimerValue(&hTimer) / (double)iterations;
+  // %u conversions now receive unsigned arguments; printed text is unchanged.
+  printf(
+      "convolutionSeparable, Throughput = %.4f MPixels/sec, Time = %.5f s, "
+      "Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n",
+      (1.0e-6 * (double)pixelCount / gpuTime), gpuTime,
+      (unsigned)pixelCount, 1, 0u);
+
+  printf("\nReading back GPU results...\n\n");
+  /*
+  DPCT1003:30: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors(
+      (dpct::get_default_queue()
+           .memcpy(h_OutputGPU, d_Output, imageBytes)
+           .wait(),
+       0));
+
+  printf("Checking the results...\n");
+  printf(" ...running convolutionRowCPU()\n");
+  convolutionRowCPU(h_Buffer, h_Input, h_Kernel, imageW, imageH, KERNEL_RADIUS);
+
+  printf(" ...running convolutionColumnCPU()\n");
+  convolutionColumnCPU(h_OutputCPU, h_Buffer, h_Kernel, imageW, imageH,
+                       KERNEL_RADIUS);
+
+  printf(" ...comparing the results\n");
+  double sum = 0, delta = 0;
+
+  for (int i = 0; i < pixelCount; i++) {
+    const double diff = h_OutputGPU[i] - h_OutputCPU[i];
+    delta += diff * diff;
+    sum += h_OutputCPU[i] * h_OutputCPU[i];
+  }
+
+  double L2norm = sqrt(delta / sum);
+  printf(" ...Relative L2 norm: %E\n\n", L2norm);
+  printf("Shutting down...\n");
+
+  /*
+  DPCT1003:31: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors((sycl::free(d_Buffer, dpct::get_default_queue()), 0));
+  /*
+  DPCT1003:32: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors((sycl::free(d_Output, dpct::get_default_queue()), 0));
+  /*
+  DPCT1003:33: Migrated API does not return error code. (*, 0) is inserted. You
+  may need to rewrite this code.
+  */
+  checkCudaErrors((sycl::free(d_Input, dpct::get_default_queue()), 0));
+  free(h_OutputGPU);
+  free(h_OutputCPU);
+  free(h_Buffer);
+  free(h_Input);
+  free(h_Kernel);
+
+  sdkDeleteTimer(&hTimer);
+
+  // Negated <= so that a NaN norm (e.g. 0/0) is reported as a failure; the
+  // original `L2norm > 1e-6` is false for NaN and would print "Test passed".
+  if (!(L2norm <= 1e-6)) {
+    printf("Test failed!\n");
+    exit(EXIT_FAILURE);
+  }
+
+  printf("Test passed\n");
+  exit(EXIT_SUCCESS);
+}
diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/include/dpct/device.hpp b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/include/dpct/device.hpp
new file mode 100644
index 0000000000..c34711c142
--- /dev/null
+++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/include/dpct/device.hpp
@@ -0,0 +1,388 @@
+//==---- device.hpp -------------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_DEVICE_HPP__
+#define __DPCT_DEVICE_HPP__
+
+#include <sycl/sycl.hpp>
+#include <cctype>
+#include <cstring>
+#include <exception>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <stdexcept>
+#include <string>
+#include <thread>
+#include <vector>
+#if defined(__linux__)
+#include <unistd.h>
+#include <sys/syscall.h>
+#endif
+#if defined(_WIN64)
+#define NOMINMAX
+#include <windows.h>
+#endif
+
+
+namespace dpct {
+
+/// SYCL default exception handler: prints every asynchronous SYCL exception
+/// in \p exceptions to std::cerr and continues.
+/// Declared `inline` so that including this header from several translation
+/// units yields a single definition.
+inline auto exception_handler = [](sycl::exception_list exceptions) {
+  for (std::exception_ptr const &e : exceptions) {
+    try {
+      std::rethrow_exception(e);
+    } catch (sycl::exception const &e) {
+      std::cerr << "Caught asynchronous SYCL exception:" << std::endl
+                << e.what() << std::endl
+                << "Exception caught at file:" << __FILE__
+                << ", line:" << __LINE__ << std::endl;
+    }
+  }
+};
+
+/// Plain value holder for the device properties collected by
+/// device_ext::get_device_info(). All fields start zero-initialised.
+class device_info {
+public:
+
+  int get_integrated() const { return _integrated; }
+
+  int get_max_clock_frequency() const { return _frequency; }
+
+  int get_max_compute_units() const { return _max_compute_units; }
+
+  /// Copies \p name into the fixed 256-byte buffer, truncating to 255
+  /// characters plus the terminating NUL.
+  void set_name(const char* name) {
+    size_t length = strlen(name);
+    if (length < 256) {
+      std::memcpy(_name, name, length + 1);
+    } else {
+      std::memcpy(_name, name, 255);
+      _name[255] = '\0';
+    }
+  }
+
+  void set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes) {
+    _max_work_item_sizes = max_work_item_sizes;
+    for (int i = 0; i < 3; ++i)
+      _max_work_item_sizes_i[i] = max_work_item_sizes[i];
+  }
+
+  void set_host_unified_memory(bool host_unified_memory) {
+    _host_unified_memory = host_unified_memory;
+  }
+
+  void set_major_version(int major) { _major = major; }
+
+  void set_minor_version(int minor) { _minor = minor; }
+
+  void set_max_clock_frequency(int frequency) { _frequency = frequency; }
+
+  void set_max_compute_units(int max_compute_units) {
+    _max_compute_units = max_compute_units;
+  }
+
+  void set_global_mem_size(size_t global_mem_size) {
+    _global_mem_size = global_mem_size;
+  }
+
+  void set_local_mem_size(size_t local_mem_size) {
+    _local_mem_size = local_mem_size;
+  }
+
+  void set_max_work_group_size(int max_work_group_size) {
+    _max_work_group_size = max_work_group_size;
+  }
+
+  void set_max_sub_group_size(int max_sub_group_size) {
+    _max_sub_group_size = max_sub_group_size;
+  }
+
+  void
+  set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit) {
+    _max_work_items_per_compute_unit = max_work_items_per_compute_unit;
+  }
+
+  /// \param max_nd_range_size Array of at least 3 ints.
+  void set_max_nd_range_size(int max_nd_range_size[]) {
+    for (int i = 0; i < 3; i++) {
+      _max_nd_range_size[i] = max_nd_range_size[i];
+      _max_nd_range_size_i[i] = max_nd_range_size[i];
+    }
+  }
+
+private:
+  char _name[256] = {};
+  sycl::id<3> _max_work_item_sizes;
+  int _max_work_item_sizes_i[3] = {};
+  bool _host_unified_memory = false;
+  int _major = 0;
+  int _minor = 0;
+  int _integrated = 0;
+  int _frequency = 0;
+  int _max_compute_units = 0;
+  int _max_work_group_size = 0;
+  int _max_sub_group_size = 0;
+  int _max_work_items_per_compute_unit = 0;
+  size_t _global_mem_size = 0;
+  size_t _local_mem_size = 0;
+  size_t _max_nd_range_size[3] = {};
+  int _max_nd_range_size_i[3] = {};
+};
+
+/// dpct device extension: a sycl::device that also owns a context and an
+/// in-order default queue created with exception_handler.
+class device_ext : public sycl::device {
+public:
+  /// Default-constructed devices own no queue; default_queue() must not be
+  /// called on them.
+  device_ext() : sycl::device(), _ctx(*this) {}
+  ~device_ext() {
+    std::lock_guard<std::mutex> lock(m_mutex);
+    for (auto &task : _tasks) {
+      if (task.joinable())
+        task.join();
+    }
+    _tasks.clear();
+    _queues.clear();
+  }
+  device_ext(const sycl::device &base)
+      : sycl::device(base), _ctx(*this) {
+    _queues.push_back(std::make_shared<sycl::queue>(
+        _ctx, base, exception_handler, sycl::property::queue::in_order()));
+    _saved_queue = _default_queue = _queues[0].get();
+  }
+
+  int get_major_version() const {
+    int major, minor;
+    get_version(major, minor);
+    return major;
+  }
+
+  int get_minor_version() const {
+    int major, minor;
+    get_version(major, minor);
+    return minor;
+  }
+
+  int get_max_compute_units() const {
+    return get_device_info().get_max_compute_units();
+  }
+
+  int get_max_clock_frequency() const {
+    return get_device_info().get_max_clock_frequency();
+  }
+
+  int get_integrated() const { return get_device_info().get_integrated(); }
+
+  /// Queries the SYCL device and writes the collected properties to \p out.
+  void get_device_info(device_info &out) const {
+    device_info prop;
+    prop.set_name(get_info<sycl::info::device::name>().c_str());
+
+    int major, minor;
+    get_version(major, minor);
+    prop.set_major_version(major);
+    prop.set_minor_version(minor);
+
+    prop.set_max_work_item_sizes(
+#if (__SYCL_COMPILER_VERSION && __SYCL_COMPILER_VERSION<20220902)
+        // oneAPI DPC++ compiler older than 2022/09/02, where max_work_item_sizes is an enum class element
+        get_info<sycl::info::device::max_work_item_sizes>());
+#else
+        // SYCL 2020-conformant code, max_work_item_sizes is a struct templated by an int
+        get_info<sycl::info::device::max_work_item_sizes<3>>());
+#endif
+    prop.set_host_unified_memory(
+        this->has(sycl::aspect::usm_host_allocations));
+
+    prop.set_max_clock_frequency(
+        get_info<sycl::info::device::max_clock_frequency>());
+
+    prop.set_max_compute_units(
+        get_info<sycl::info::device::max_compute_units>());
+    prop.set_max_work_group_size(
+        get_info<sycl::info::device::max_work_group_size>());
+    prop.set_global_mem_size(
+        get_info<sycl::info::device::global_mem_size>());
+    prop.set_local_mem_size(get_info<sycl::info::device::local_mem_size>());
+
+    // Largest supported sub-group size; 1 if the device reports none.
+    size_t max_sub_group_size = 1;
+    std::vector<size_t> sub_group_sizes =
+        get_info<sycl::info::device::sub_group_sizes>();
+
+    for (const auto &sub_group_size : sub_group_sizes) {
+      if (max_sub_group_size < sub_group_size)
+        max_sub_group_size = sub_group_size;
+    }
+
+    prop.set_max_sub_group_size(max_sub_group_size);
+
+    // NOTE(review): SYCL has no per-compute-unit work-item query; the
+    // work-group size limit is used as the stand-in -- confirm against the
+    // upstream dpct header.
+    prop.set_max_work_items_per_compute_unit(
+        get_info<sycl::info::device::max_work_group_size>());
+    int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
+    prop.set_max_nd_range_size(max_nd_range_size);
+
+    out = prop;
+  }
+
+  device_info get_device_info() const {
+    device_info prop;
+    get_device_info(prop);
+    return prop;
+  }
+
+  sycl::queue &default_queue() { return *_default_queue; }
+
+  /// Waits on every queue owned by this device and rethrows their
+  /// asynchronous errors.
+  void queues_wait_and_throw() {
+    std::unique_lock<std::mutex> lock(m_mutex);
+    // Work on a snapshot so the mutex is not held while waiting.
+    std::vector<std::shared_ptr<sycl::queue>> current_queues(
+        _queues);
+    lock.unlock();
+    for (const auto &q : current_queues) {
+      q->wait_and_throw();
+    }
+    // Guard the destruct of current_queues to make sure the ref count is safe.
+    lock.lock();
+  }
+
+private:
+
+  /// Parses "<major>.<minor>" out of the device version string.
+  /// \throws std::invalid_argument if the string holds no digit, or if the
+  ///         text after the first '.' is not numeric (as before).
+  /// A version without any '.' now yields minor == 0; the original indexed
+  /// one past the end of the string in that case.
+  void get_version(int &major, int &minor) const {
+    // Version string has the following format:
+    // a. OpenCL<space><major.minor><space><vendor-specific-information>
+    // b. <major.minor>
+    const std::string ver = get_info<sycl::info::device::version>();
+    std::string::size_type i = 0;
+    // isdigit requires a value representable as unsigned char.
+    while (i < ver.size() &&
+           !std::isdigit(static_cast<unsigned char>(ver[i])))
+      i++;
+    major = std::stoi(ver.substr(i));
+    const std::string::size_type dot = ver.find('.', i);
+    minor = (dot == std::string::npos) ? 0 : std::stoi(ver.substr(dot + 1));
+  }
+
+  sycl::queue *_default_queue = nullptr;
+  sycl::queue *_saved_queue = nullptr;
+  sycl::context _ctx;
+  std::vector<std::shared_ptr<sycl::queue>> _queues;
+  mutable std::mutex m_mutex;
+  std::vector<std::thread> _tasks;
+};
+
+/// Returns the OS-level id of the calling thread.
+static inline unsigned int get_tid() {
+#if defined(__linux__)
+  return syscall(SYS_gettid);
+#elif defined(_WIN64)
+  return GetCurrentThreadId();
+#else
+#error "Only support Windows and Linux."
+#endif
+}
+
+/// device manager: singleton that enumerates all SYCL devices (default
+/// device first) and tracks which device each thread has selected.
+class dev_mgr {
+
+public:
+
+  device_ext &current_device() {
+    unsigned int dev_id=current_device_id();
+    check_id(dev_id);
+    return *_devs[dev_id];
+  }
+
+  device_ext &get_device(unsigned int id) const {
+    std::lock_guard<std::mutex> lock(m_mutex);
+    check_id(id);
+    return *_devs[id];
+  }
+
+  /// Device id selected by the calling thread, or DEFAULT_DEVICE_ID.
+  unsigned int current_device_id() const {
+    std::lock_guard<std::mutex> lock(m_mutex);
+    auto it=_thread2dev_map.find(get_tid());
+    if(it != _thread2dev_map.end())
+      return it->second;
+    return DEFAULT_DEVICE_ID;
+  }
+
+  /// \throws std::runtime_error if \p id is not a valid device index.
+  void select_device(unsigned int id) {
+    std::lock_guard<std::mutex> lock(m_mutex);
+    check_id(id);
+    _thread2dev_map[get_tid()]=id;
+  }
+
+  unsigned int device_count() const {
+    return static_cast<unsigned int>(_devs.size());
+  }
+
+  /// Returns the instance of device manager singleton.
+  static dev_mgr &instance() {
+    static dev_mgr d_m;
+    return d_m;
+  }
+  dev_mgr(const dev_mgr &) = delete;
+  dev_mgr &operator=(const dev_mgr &) = delete;
+  dev_mgr(dev_mgr &&) = delete;
+  dev_mgr &operator=(dev_mgr &&) = delete;
+
+private:
+  mutable std::mutex m_mutex;
+  dev_mgr() {
+    sycl::device default_device =
+        sycl::device(sycl::default_selector_v);
+    _devs.push_back(std::make_shared<device_ext>(default_device));
+
+    std::vector<sycl::device> sycl_all_devs =
+        sycl::device::get_devices(sycl::info::device_type::all);
+    // Collect other devices except for the default device.
+    if (default_device.is_cpu())
+      _cpu_device = 0;
+    for (auto &dev : sycl_all_devs) {
+      if (dev == default_device) {
+        continue;
+      }
+      _devs.push_back(std::make_shared<device_ext>(dev));
+      if (_cpu_device == -1 && dev.is_cpu()) {
+        _cpu_device = _devs.size() - 1;
+      }
+    }
+  }
+
+  void check_id(unsigned int id) const {
+    if (id >= _devs.size()) {
+      throw std::runtime_error("invalid device id");
+    }
+  }
+
+  std::vector<std::shared_ptr<device_ext>> _devs;
+  /// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current
+  /// thread id in _thread2dev_map, which means default device should be used
+  /// for the current thread.
+  const unsigned int DEFAULT_DEVICE_ID = 0;
+  /// thread-id to device-id map.
+  std::map<unsigned int, unsigned int> _thread2dev_map;
+  int _cpu_device = -1;
+};
+
+/// Util function to get the default queue of current device in
+/// dpct device manager.
+static inline sycl::queue &get_default_queue() {
+  return dev_mgr::instance().current_device().default_queue();
+}
+
+/// Util function to get the current device.
+static inline device_ext &get_current_device() {
+  return dev_mgr::instance().current_device();
+}
+
+/// Selects device \p id for the calling thread and returns \p id.
+static inline unsigned int select_device(unsigned int id){
+  dev_mgr::instance().select_device(id);
+  return id;
+}
+
+} // namespace dpct
+
+#endif // __DPCT_DEVICE_HPP__
diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/include/dpct/dpct.hpp b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/include/dpct/dpct.hpp
new file mode 100644
index 0000000000..3099386e04
--- /dev/null
+++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/include/dpct/dpct.hpp
@@ -0,0 +1,19 @@
+//==---- dpct.hpp ---------------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_HPP__ +#define __DPCT_HPP__ + +#include +#include +#include + +#include "device.hpp" +#include "memory.hpp" + +#endif // __DPCT_HPP__ diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/include/dpct/memory.hpp b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/include/dpct/memory.hpp new file mode 100644 index 0000000000..75bf8ec102 --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/01_dpct_output/include/dpct/memory.hpp @@ -0,0 +1,600 @@ +//==---- memory.hpp -------------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_MEMORY_HPP__ +#define __DPCT_MEMORY_HPP__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__linux__) +#include +#elif defined(_WIN64) +#define NOMINMAX +#include +#else +#error "Only support Windows and Linux." +#endif + +#include "device.hpp" + +namespace dpct { + +enum memcpy_direction { + host_to_host, + host_to_device, + device_to_host, + device_to_device, + automatic +}; + +enum memory_region { + global = 0, // device global memory + constant, // device constant memory + local, // device local memory + shared, // memory which can be accessed by host and device +}; + +typedef uint8_t byte_t; + +/// Buffer type to be used in Memory Management runtime. +typedef sycl::buffer buffer_t; + +/// Pitched 2D/3D memory data. 
+class pitched_data {
+public:
+  pitched_data() : pitched_data(nullptr, 0, 0, 0) {}
+  /// \param data  Base pointer of the allocation.
+  /// \param pitch Row pitch.
+  /// \param x     Extent in x.
+  /// \param y     Extent in y.
+  pitched_data(void *data, size_t pitch, size_t x, size_t y)
+      : _data(data), _pitch(pitch), _x(x), _y(y) {}
+
+  // Getters are const so they can be used on const pitched_data objects.
+  void *get_data_ptr() const { return _data; }
+
+  size_t get_pitch() const { return _pitch; }
+
+  size_t get_y() const { return _y; }
+
+private:
+  void *_data;
+  size_t _pitch, _x, _y;
+};
+
+namespace detail {
+
+/// Memory manager singleton. In this trimmed header it only reserves (and on
+/// destruction releases) a block of virtual address space.
+class mem_mgr {
+  mem_mgr() {
+    // Reserved address space, no real memory allocation happens here.
+#if defined(__linux__)
+    mapped_address_space =
+        (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE,
+                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+#elif defined(_WIN64)
+    mapped_address_space = (byte_t *)VirtualAlloc(
+        NULL,               // NULL specified as the base address parameter
+        mapped_region_size, // Size of allocation
+        MEM_RESERVE,        // Allocate reserved pages
+        PAGE_NOACCESS);     // Protection = no access
+#else
+#error "Only support Windows and Linux."
+#endif
+    next_free = mapped_address_space;
+  };
+
+public:
+  using buffer_id_t = int;
+
+  struct allocation {
+    buffer_t buffer;
+    byte_t *alloc_ptr;
+    size_t size;
+  };
+
+  ~mem_mgr() {
+#if defined(__linux__)
+    munmap(mapped_address_space, mapped_region_size);
+#elif defined(_WIN64)
+    VirtualFree(mapped_address_space, 0, MEM_RELEASE);
+#else
+#error "Only support Windows and Linux."
+#endif
+  };
+
+  mem_mgr(const mem_mgr &) = delete;
+  mem_mgr &operator=(const mem_mgr &) = delete;
+  mem_mgr(mem_mgr &&) = delete;
+  mem_mgr &operator=(mem_mgr &&) = delete;
+
+  /// Returns the instance of memory manager singleton.
+  static mem_mgr &instance() {
+    static mem_mgr m;
+    return m;
+  }
+
+private:
+  std::map<byte_t *, allocation> m_map;
+  mutable std::mutex m_mutex;
+  byte_t *mapped_address_space;
+  byte_t *next_free;
+  const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024;
+  const size_t alignment = 256;
+  /// This padding may be defined to some positive value to debug
+  /// out of bound accesses.
+  const size_t extra_padding = 0;
+
+};
+
+template <class T, memory_region Memory, size_t Dimension> class accessor;
+/// Compile-time traits (access target/mode, element and accessor types) for a
+/// value of type T living in memory region Memory.
+template <memory_region Memory, class T = byte_t> class memory_traits {
+public:
+  static constexpr sycl::access::target target =
+      (Memory == constant) ? sycl::access::target::constant_buffer
+                           : sycl::access::target::device;
+  static constexpr sycl::access_mode mode =
+      (Memory == constant) ? sycl::access_mode::read
+                           : sycl::access_mode::read_write;
+  static constexpr size_t type_size = sizeof(T);
+  using element_t =
+      typename std::conditional<Memory == constant, const T, T>::type;
+  using value_t = typename std::remove_cv<T>::type;
+  template <size_t Dimension = 1>
+  using accessor_t = typename std::conditional<
+      Memory == local, sycl::local_accessor<value_t, Dimension>,
+      sycl::accessor<T, Dimension, mode, target>>::type;
+  using pointer_t = T *;
+};
+
+/// Allocates \p size bytes of USM device memory on the device/context of \p q.
+static inline void *dpct_malloc(size_t size, sycl::queue &q) {
+  return sycl::malloc_device(size, q.get_device(), q.get_context());
+}
+
+enum class pointer_access_attribute {
+  host_only = 0,
+  device_only,
+  host_device,
+  end
+};
+
+/// Classifies \p ptr by the USM allocation kind the context of \p q reports:
+/// unknown -> host_only, device -> device_only, shared/host -> host_device.
+/// \throws std::runtime_error for an allocation kind outside those four, so
+///         the function can no longer run off its end without returning.
+static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
+                                                      const void *ptr) {
+  switch (sycl::get_pointer_type(ptr, q.get_context())) {
+  case sycl::usm::alloc::unknown:
+    return pointer_access_attribute::host_only;
+  case sycl::usm::alloc::device:
+    return pointer_access_attribute::device_only;
+  case sycl::usm::alloc::shared:
+  case sycl::usm::alloc::host:
+    return pointer_access_attribute::host_device;
+  }
+  // Callers index a fixed-size table with the result, so an unexpected kind
+  // must not fall through.
+  throw std::runtime_error("dpct: unexpected USM allocation kind");
+}
+
+/// Resolves memcpy_direction::automatic from the pointer attributes of
+/// \p to_ptr and \p from_ptr; explicit directions are returned unchanged.
+/// \throws std::runtime_error if \p dir is not a valid enumerator.
+static memcpy_direction deduce_memcpy_direction(sycl::queue &q, void *to_ptr,
+                                             const void *from_ptr,
+                                             memcpy_direction dir) {
+  switch (dir) {
+  case memcpy_direction::host_to_host:
+  case memcpy_direction::host_to_device:
+  case memcpy_direction::device_to_host:
+  case memcpy_direction::device_to_device:
+    return dir;
+  case memcpy_direction::automatic: {
+    // table[to_attribute][from_attribute]
+    static const memcpy_direction
+        direction_table[static_cast<unsigned>(pointer_access_attribute::end)]
+                       [static_cast<unsigned>(pointer_access_attribute::end)] =
+                           {{memcpy_direction::host_to_host,
+                             memcpy_direction::device_to_host,
+                             memcpy_direction::host_to_host},
+                            {memcpy_direction::host_to_device,
+                             memcpy_direction::device_to_device,
+                             memcpy_direction::device_to_device},
+                            {memcpy_direction::host_to_host,
+                             memcpy_direction::device_to_device,
+                             memcpy_direction::device_to_device}};
+    return direction_table[static_cast<unsigned>(get_pointer_attribute(
+        q, to_ptr))][static_cast<unsigned>(get_pointer_attribute(q, from_ptr))];
+  }
+  default:
+    throw std::runtime_error("dpct_memcpy: invalid direction value");
+  }
+}
+
+/// Enqueues an asynchronous copy of \p size bytes on \p q after \p dep_events.
+/// Returns a default-constructed event when \p size is 0. \p direction is not
+/// consulted here; the USM memcpy handles every direction.
+static sycl::event
+dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
+            memcpy_direction direction,
+            const std::vector<sycl::event> &dep_events = {}) {
+  if (!size)
+    return sycl::event{};
+  return q.memcpy(to_ptr, from_ptr, size, dep_events);
+}
+
+// Get actual copy range and make sure it will not exceed range.
+static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
+                                    size_t pitch) {
+  return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
+}
+
+// Linear byte offset of element `id` for the given slice and row pitch.
+static inline size_t get_offset(sycl::id<3> id, size_t slice,
+                                    size_t pitch) {
+  return slice * id.get(2) + pitch * id.get(1) + id.get(0);
+}
+
+/// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
+/// and \p from_range to another specified by \p to_ptr and \p to_range.
+static inline std::vector<sycl::event>
+dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+            sycl::range<3> to_range, sycl::range<3> from_range,
+            sycl::id<3> to_id, sycl::id<3> from_id,
+            sycl::range<3> size, memcpy_direction direction,
+            const std::vector<sycl::event> &dep_events = {}) {
+  // RAII for host pointer: a malloc'ed staging buffer whose release is
+  // submitted to the queue as a host task that depends on `_deps`.
+  class host_buffer {
+    void *_buf;
+    size_t _size;
+    sycl::queue &_q;
+    const std::vector<sycl::event> &_deps; // free operation depends
+
+  public:
+    host_buffer(size_t size, sycl::queue &q,
+                const std::vector<sycl::event> &deps)
+        : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {
+      // Fail here rather than enqueue copies through a null staging pointer.
+      if (!_buf && size)
+        throw std::runtime_error(
+            "dpct_memcpy: host staging buffer allocation failed");
+    }
+    void *get_ptr() const { return _buf; }
+    size_t get_size() const { return _size; }
+    ~host_buffer() {
+      if (_buf) {
+        _q.submit([&](sycl::handler &cgh) {
+          cgh.depends_on(_deps);
+          cgh.host_task([buf = _buf] { std::free(buf); });
+        });
+      }
+    }
+  };
+  std::vector<sycl::event> event_list;
+
+  size_t to_slice = to_range.get(1) * to_range.get(0),
+         from_slice = from_range.get(1) * from_range.get(0);
+  unsigned char *to_surface =
+      (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
+  const unsigned char *from_surface =
+      (const unsigned char *)from_ptr +
+      get_offset(from_id, from_slice, from_range.get(0));
+
+  // Both sides and the copied region share one slice size: the region is
+  // contiguous, so a single linear copy is enough.
+  if (to_slice == from_slice && to_slice == size.get(1) * size.get(0)) {
+    return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
+                        direction, dep_events)};
+  }
+  direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
+  size_t size_slice = size.get(1) * size.get(0);
+  switch (direction) {
+  case host_to_host:
+    for (size_t z = 0; z < size.get(2); ++z) {
+      unsigned char *to_row = to_surface;
+      const unsigned char *from_row = from_surface;
+      if (to_range.get(0) == from_range.get(0) &&
+          to_range.get(0) == size.get(0)) {
+        // Rows are contiguous on both sides: copy the whole slice at once.
+        event_list.push_back(dpct_memcpy(q, to_row, from_row, size_slice,
+                                         direction, dep_events));
+      } else {
+        for (size_t y = 0; y < size.get(1); ++y) {
+          event_list.push_back(dpct_memcpy(q, to_row, from_row, size.get(0),
+                                           direction, dep_events));
+          to_row += to_range.get(0);
+          from_row += from_range.get(0);
+        }
+      }
+      to_surface += to_slice;
+      from_surface += from_slice;
+    }
+    break;
+  case host_to_device: {
+    host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
+                    event_list);
+    std::vector<sycl::event> host_events;
+    if (to_slice == size_slice) {
+      // Copy host data to a temp host buffer with the shape of target.
+      host_events =
+          dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
+                      sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
+                      host_to_host, dep_events);
+    } else {
+      // Copy host data to a temp host buffer with the shape of target.
+      host_events = dpct_memcpy(
+          q, buf.get_ptr(), from_surface, to_range, from_range,
+          sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host,
+          // If has padding data, not sure whether it is useless. So fill temp
+          // buffer with it.
+          std::vector<sycl::event>{
+              dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
+                          device_to_host, dep_events)});
+    }
+    // Copy from temp host buffer to device with only one submit.
+    event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
+                                     buf.get_size(), host_to_device,
+                                     host_events));
+    break;
+  }
+  case device_to_host: {
+    host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
+                    event_list);
+    // Copy from host temp buffer to host target with reshaping.
+    event_list = dpct_memcpy(
+        q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
+        sycl::id<3>(0, 0, 0), size, host_to_host,
+        // Copy from device to temp host buffer with only one submit.
+        std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
+                                              buf.get_size(),
+                                              device_to_host, dep_events)});
+    break;
+  }
+  case device_to_device:
+    // Element-wise copy kernel over the 3D region.
+    event_list.push_back(q.submit([&](sycl::handler &cgh) {
+      cgh.depends_on(dep_events);
+      cgh.parallel_for(
+          size,
+          [=](sycl::id<3> id) {
+            to_surface[get_offset(id, to_slice, to_range.get(0))] =
+                from_surface[get_offset(id, from_slice, from_range.get(0))];
+          });
+    }));
+    break;
+  default:
+    throw std::runtime_error("dpct_memcpy: invalid direction value");
+  }
+  return event_list;
+}
+
+/// memcpy 2D/3D matrix specified by pitched_data.
+static inline std::vector<sycl::event>
+dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
+            pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
+            memcpy_direction direction = automatic) {
+  return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
+                     sycl::range<3>(to.get_pitch(), to.get_y(), 1),
+                     sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
+                     size, direction);
+}
+
+/// memcpy 2D matrix with pitch.
+static inline std::vector<sycl::event>
+dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+            size_t to_pitch, size_t from_pitch, size_t x, size_t y,
+            memcpy_direction direction = automatic) {
+  return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
+                     sycl::range<3>(from_pitch, y, 1),
+                     sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
+                     sycl::range<3>(x, y, 1), direction);
+}
+
+} // namespace detail
+
+/// free
+/// \param ptr Point to free. A null pointer is ignored.
+/// \param q Queue whose context owns the allocation.
+/// \returns no return value.
+static inline void dpct_free(void *ptr,
+                             sycl::queue &q = get_default_queue()) {
+  if (ptr) {
+    sycl::free(ptr, q.get_context());
+  }
+}
+
+/// Synchronously copies \p size bytes from the address specified by \p from_ptr
+/// to the address specified by \p to_ptr. The value of \p direction is used to
+/// set the copy direction, it can be \a host_to_host, \a host_to_device,
+/// \a device_to_host, \a device_to_device or \a automatic. The function will
+/// return after the copy is completed.
+///
+/// \param to_ptr Pointer to destination memory address.
+/// \param from_ptr Pointer to source memory address.
+/// \param size Number of bytes to be copied.
+/// \param direction Direction of the copy.
+/// \param q Queue to execute the copy task.
+/// \returns no return value.
+static inline void dpct_memcpy(void *to_ptr, const void *from_ptr, size_t size,
+                               memcpy_direction direction = automatic,
+                               sycl::queue &q = get_default_queue()) {
+  detail::dpct_memcpy(q, to_ptr, from_ptr, size, direction).wait();
+}
+
+/// Synchronously copies 2D matrix specified by \p x and \p y from the address
+/// specified by \p from_ptr to the address specified by \p to_ptr, while \p
+/// from_pitch and \p to_pitch are the range of dim x in bytes of the matrix
+/// specified by \p from_ptr and \p to_ptr. The value of \p direction is used to
+/// set the copy direction, it can be \a host_to_host, \a host_to_device, \a
+/// device_to_host, \a device_to_device or \a automatic. The function will
+/// return after the copy is completed.
+///
+/// \param to_ptr Pointer to destination memory address.
+/// \param to_pitch Range of dim x in bytes of destination matrix.
+/// \param from_ptr Pointer to source memory address.
+/// \param from_pitch Range of dim x in bytes of source matrix.
+/// \param x Range of dim x of matrix to be copied.
+/// \param y Range of dim y of matrix to be copied.
+/// \param direction Direction of the copy.
+/// \param q Queue to execute the copy task.
+/// \returns no return value.
+static inline void dpct_memcpy(void *to_ptr, size_t to_pitch,
+                               const void *from_ptr, size_t from_pitch,
+                               size_t x, size_t y,
+                               memcpy_direction direction = automatic,
+                               sycl::queue &q = dpct::get_default_queue()) {
+  sycl::event::wait(detail::dpct_memcpy(q, to_ptr, from_ptr, to_pitch,
+                                        from_pitch, x, y, direction));
+}
+
+/// Synchronously copies a subset of a 3D matrix specified by \p from to another
+/// 3D matrix specified by \p to. The from and to position info are specified
+/// by \p from_pos and \p to_pos. The copied matrix size is specified by \p size.
+/// The value of \p direction is used to set the copy direction, it can be \a
+/// host_to_host, \a host_to_device, \a device_to_host, \a device_to_device or
+/// \a automatic. The function will return after the copy is completed.
+///
+/// \param to Destination matrix info.
+/// \param to_pos Position of destination.
+/// \param from Source matrix info.
+/// \param from_pos Position of source.
+/// \param size Range of the submatrix to be copied.
+/// \param direction Direction of the copy.
+/// \param q Queue to execute the copy task.
+/// \returns no return value.
+static inline void dpct_memcpy(pitched_data to, sycl::id<3> to_pos,
+                               pitched_data from, sycl::id<3> from_pos,
+                               sycl::range<3> size,
+                               memcpy_direction direction = automatic,
+                               sycl::queue &q = dpct::get_default_queue()) {
+  sycl::event::wait(
+      detail::dpct_memcpy(q, to, to_pos, from, from_pos, size, direction));
+}
+
+namespace detail {
+
+/// Device variable with address space of shared, global or constant.
+template <class T, memory_region Memory, size_t Dimension>
+class device_memory {
+public:
+
+  using value_t = typename detail::memory_traits<Memory, T>::value_t;
+
+  device_memory() : device_memory(sycl::range<Dimension>(1)) {}
+
+  /// Constructor of 1-D array with initializer list
+  /// The values are staged in a zero-filled host buffer and uploaded to the
+  /// device by init().
+  /// \throws std::runtime_error if the host staging buffer cannot be allocated.
+  device_memory(
+      const sycl::range<Dimension> &in_range,
+      std::initializer_list<value_t> &&init_list)
+      : device_memory(in_range) {
+    assert(init_list.size() <= in_range.size());
+    allocate_host_zeroed();
+    std::memcpy(_host_ptr, init_list.begin(), init_list.size() * sizeof(T));
+  }
+
+  /// Constructor of 2-D array with initializer list
+  /// Each inner list fills the start of one row; the rest stays zero.
+  /// \throws std::runtime_error if the host staging buffer cannot be allocated.
+  template <size_t D = Dimension>
+  device_memory(
+      const typename std::enable_if<D == 2, sycl::range<2>>::type &in_range,
+      std::initializer_list<std::initializer_list<value_t>> &&init_list)
+      : device_memory(in_range) {
+    assert(init_list.size() <= in_range[0]);
+    allocate_host_zeroed();
+    auto tmp_data = _host_ptr;
+    for (const auto &sub_list : init_list) {
+      assert(sub_list.size() <= in_range[1]);
+      std::memcpy(tmp_data, sub_list.begin(), sub_list.size() * sizeof(T));
+      tmp_data += in_range[1];
+    }
+  }
+
+  /// Constructor with range
+  device_memory(const sycl::range<Dimension> &range_in)
+      : _size(range_in.size() * sizeof(T)), _range(range_in), _reference(false),
+        _host_ptr(nullptr), _device_ptr(nullptr) {
+    static_assert(
+        (Memory == global) || (Memory == constant) || (Memory == shared),
+        "device memory region should be global, constant or shared");
+    // Make sure that singleton class mem_mgr and dev_mgr will destruct later
+    // than this.
+    detail::mem_mgr::instance();
+    dev_mgr::instance();
+  }
+
+  /// Constructor with range
+  template <class... Args>
+  device_memory(Args... Arguments)
+      : device_memory(sycl::range<Dimension>(Arguments...)) {}
+
+  // NOTE(review): the class frees raw pointers here but still has implicit
+  // copy operations; copying an instance would free the same memory twice.
+  ~device_memory() {
+    if (_device_ptr && !_reference)
+      dpct_free(_device_ptr);
+    if (_host_ptr)
+      std::free(_host_ptr);
+  }
+
+  /// Allocate memory with default queue, and init memory if has initial value.
+  void init() {
+    init(dpct::get_default_queue());
+  }
+
+  /// Allocate memory with specified queue, and init memory if has initial value.
+  /// Does nothing if the memory is already allocated or the size is 0.
+  void init(sycl::queue &q) {
+    if (_device_ptr)
+      return;
+    if (!_size)
+      return;
+    allocate_device(q);
+    if (_host_ptr)
+      detail::dpct_memcpy(q, _device_ptr, _host_ptr, _size, host_to_device);
+  }
+
+  /// Get memory pointer of the memory object, which is virtual pointer when
+  /// usm is not used, and device pointer when usm is used.
+  value_t *get_ptr() {
+    return get_ptr(get_default_queue());
+  }
+
+  /// Get memory pointer of the memory object, which is virtual pointer when
+  /// usm is not used, and device pointer when usm is used.
+  value_t *get_ptr(sycl::queue &q) {
+    init(q);
+    return _device_ptr;
+  }
+
+  /// Element access for 1-D memory; allocates on first use.
+  template <size_t D = Dimension>
+  typename std::enable_if<D == 1, T>::type &operator[](size_t index) {
+    init();
+    return _device_ptr[index];
+  }
+
+private:
+  /// Wraps existing device memory without owning it. `_host_ptr` is set to
+  /// null explicitly: the destructor frees any non-null `_host_ptr`, and the
+  /// original left it uninitialized on this path.
+  device_memory(value_t *memory_ptr, size_t size)
+      : _size(size), _range(size / sizeof(T)), _reference(true),
+        _host_ptr(nullptr), _device_ptr(memory_ptr) {}
+
+  /// Allocates the zero-filled host staging buffer of `_size` bytes.
+  void allocate_host_zeroed() {
+    _host_ptr = (value_t *)std::malloc(_size);
+    if (!_host_ptr && _size)
+      throw std::runtime_error(
+          "dpct::device_memory: host staging allocation failed");
+    std::memset(_host_ptr, 0, _size);
+  }
+
+  void allocate_device(sycl::queue &q) {
+    if (Memory == shared) {
+      _device_ptr = (value_t *)sycl::malloc_shared(
+          _size, q.get_device(), q.get_context());
+      return;
+    }
+    _device_ptr = (value_t *)detail::dpct_malloc(_size, q);
+  }
+
+  size_t _size;
+  sycl::range<Dimension> _range;
+  bool _reference;
+  value_t *_host_ptr;
+  value_t *_device_ptr;
+};
+/// Specialization for a single value (Dimension 0), stored as a 1-element
+/// 1-D device_memory.
+template <class T, memory_region Memory>
+class device_memory<T, Memory, 0> : public device_memory<T, Memory, 1> {
+public:
+  using base = device_memory<T, Memory, 1>;
+  using value_t = typename base::value_t;
+
+  /// Constructor with initial value.
+  device_memory(const value_t &val) : base(sycl::range<1>(1), {val}) {}
+
+  /// Default constructor
+  device_memory() : base(1) {}
+
+};
+
+} // namespace detail
+
+template <class T, size_t Dimension>
+using constant_memory = detail::device_memory<T, constant, Dimension>;
+
+} // namespace dpct
+
+#endif // __DPCT_MEMORY_HPP__
diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/CMakeLists.txt b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/CMakeLists.txt
new file mode 100644
index 0000000000..cedc9c5e59
--- /dev/null
+++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/CMakeLists.txt
@@ -0,0 +1,9 @@
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl")
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
+include_directories(${CMAKE_SOURCE_DIR}/02_sycl_migrated_optimized/Common/)
+include_directories(${CMAKE_SOURCE_DIR}/02_sycl_migrated_optimized/include/)
+
+add_executable (02_sycl_migrated_optimized Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp.dp.cpp Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.dp.cpp Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_gold.cpp)
+target_link_libraries(02_sycl_migrated_optimized sycl)
+
+add_custom_target (run_smo cd ${CMAKE_SOURCE_DIR}/02_sycl_migrated_optimized/ && ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/02_sycl_migrated_optimized)
diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Common/exception.h b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Common/exception.h
new file mode 100644
index 0000000000..ca8ac25258
--- /dev/null
+++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Common/exception.h
@@ -0,0 +1,151 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* CUda UTility Library */
+#ifndef COMMON_EXCEPTION_H_
+#define COMMON_EXCEPTION_H_
+
+// includes, system
+#include <stdlib.h>
+#include <exception>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+//! Exception wrapper.
+//! Adds file/line context to a standard exception type; instances can only
+//! be created (and thrown) through the static throw_it() functions.
+//! @param Std_Exception Exception out of namespace std for easy typing.
+template <class Std_Exception>
+class Exception : public Std_Exception {
+ public:
+  //! @brief Static construction interface
+  //! @return Always throws ( Located_Exception)
+  //! @param file file in which the Exception occurs
+  //! @param line line in which the Exception occurs
+  //! @param detailed details on the code fragment causing the Exception
+  static void throw_it(const char *file, const int line,
+                       const char *detailed = "-");
+
+  //! Static construction interface
+  //! @return Always throws ( Located_Exception)
+  //! @param file file in which the Exception occurs
+  //! @param line line in which the Exception occurs
+  //! @param detailed details on the code fragment causing the Exception
+  static void throw_it(const char *file, const int line,
+                       const std::string &detailed);
+
+  //! Destructor
+  virtual ~Exception() throw();
+
+ private:
+  //! Constructor, default (private)
+  Exception();
+
+  //! Constructor, standard
+  //! @param str string returned by what()
+  explicit Exception(const std::string &str);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//! Exception handler function for arbitrary exceptions.
+//! Prints ex.what() to std::cerr and terminates the process with
+//! EXIT_FAILURE.
+//! @param ex exception to handle
+////////////////////////////////////////////////////////////////////////////////
+template <class Exception_Typ>
+inline void handleException(const Exception_Typ &ex) {
+  std::cerr << ex.what() << std::endl;
+
+  exit(EXIT_FAILURE);
+}
+
+//! Convenience macros
+
+//! Exception caused by dynamic program behavior, e.g. file does not exist
+#define RUNTIME_EXCEPTION(msg) \
+  Exception<std::runtime_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Logic exception in program, e.g. an assert failed
+#define LOGIC_EXCEPTION(msg) \
+  Exception<std::logic_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Out of range exception
+#define RANGE_EXCEPTION(msg) \
+  Exception<std::range_error>::throw_it(__FILE__, __LINE__, msg)
+
+////////////////////////////////////////////////////////////////////////////////
+//! Implementation
+
+// includes, system
+#include <sstream>
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface.
+//! Builds the message from file, line and description, then throws.
+//! @param  Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <class Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const char *detailed) {
+  std::stringstream s;
+
+  // Quite heavy-weight but exceptions are not for
+  // performance / release versions
+  s << "Exception in file '" << file << "' in line " << line << "\n"
+    << "Detailed description: " << detailed << "\n";
+
+  throw Exception(s.str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface.
+//! Forwards to the const char* overload.
+//! @param  Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <class Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const std::string &msg) {
+  throw_it(file, line, msg.c_str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, default (private).
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::Exception() : Std_Exception("Unknown Exception.\n") {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, standard (private).
+//! String returned by what().
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::Exception(const std::string &s) : Std_Exception(s) {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! 
Destructor +//////////////////////////////////////////////////////////////////////////////// +template +Exception::~Exception() throw() {} + + // functions, exported + +#endif // COMMON_EXCEPTION_H_ diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Common/helper_cuda.h b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Common/helper_cuda.h new file mode 100644 index 0000000000..93ddee5248 --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Common/helper_cuda.h @@ -0,0 +1,1053 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +//////////////////////////////////////////////////////////////////////////////// +// These are CUDA Helper functions for initialization and error checking + +#ifndef COMMON_HELPER_CUDA_H_ +#define COMMON_HELPER_CUDA_H_ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// Note, it is required that your SDK sample to include the proper header +// files, please refer the CUDA examples for examples of the needed CUDA +// headers, which may change depending on which CUDA functions are used. + +// CUDA Runtime error messages +#ifdef __DPCT_HPP__ +static const char *_cudaGetErrorEnum(int error) { + /* + DPCT1009:4: SYCL uses exceptions to report errors and does not use the error + codes. The original code was commented out and a warning string was inserted. + You need to rewrite this code. + */ + return "cudaGetErrorName is not supported" /*cudaGetErrorName(error)*/; +} +#endif + +#ifdef CUDA_DRIVER_API +// CUDA Driver API errors +static const char *_cudaGetErrorEnum(CUresult error) { + static char unknown[] = ""; + const char *ret = NULL; + cuGetErrorName(error, &ret); + return ret ? 
ret : unknown; +} +#endif + +#ifdef CUBLAS_API_H_ +// cuBLAS API errors +static const char *_cudaGetErrorEnum(cublasStatus_t error) { + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + } + + return ""; +} +#endif + +#ifdef _CUFFT_H_ +// cuFFT API errors +static const char *_cudaGetErrorEnum(cufftResult error) { + switch (error) { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; + + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + + case CUFFT_NOT_IMPLEMENTED: + return 
"CUFFT_NOT_IMPLEMENTED"; + + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; + + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSPARSEAPI +// cuSPARSE API errors +static const char *_cudaGetErrorEnum(cusparseStatus_t error) { + switch (error) { + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; + + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; + + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; + + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; + + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; + + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSOLVER_COMMON_H_ +// cuSOLVER API errors +static const char *_cudaGetErrorEnum(cusolverStatus_t error) { + switch (error) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_MAPPING_ERROR: + return "CUSOLVER_STATUS_MAPPING_ERROR"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case 
CUSOLVER_STATUS_NOT_SUPPORTED: + return "CUSOLVER_STATUS_NOT_SUPPORTED "; + case CUSOLVER_STATUS_ZERO_PIVOT: + return "CUSOLVER_STATUS_ZERO_PIVOT"; + case CUSOLVER_STATUS_INVALID_LICENSE: + return "CUSOLVER_STATUS_INVALID_LICENSE"; + } + + return ""; +} +#endif + +#ifdef CURAND_H_ +// cuRAND API errors +static const char *_cudaGetErrorEnum(int error) { + switch (error) { + case 0: + return "CURAND_STATUS_SUCCESS"; + + case 100: + return "CURAND_STATUS_VERSION_MISMATCH"; + + case 101: + return "CURAND_STATUS_NOT_INITIALIZED"; + + case 102: + return "CURAND_STATUS_ALLOCATION_FAILED"; + + case 103: + return "CURAND_STATUS_TYPE_ERROR"; + + case 104: + return "CURAND_STATUS_OUT_OF_RANGE"; + + case 105: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + + case 106: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + + case 201: + return "CURAND_STATUS_LAUNCH_FAILURE"; + + case 202: + return "CURAND_STATUS_PREEXISTING_FAILURE"; + + case 203: + return "CURAND_STATUS_INITIALIZATION_FAILED"; + + case 204: + return "CURAND_STATUS_ARCH_MISMATCH"; + + case 999: + return "CURAND_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +#ifdef NVJPEGAPI +// nvJPEG API errors +static const char *_cudaGetErrorEnum(nvjpegStatus_t error) { + switch (error) { + case NVJPEG_STATUS_SUCCESS: + return "NVJPEG_STATUS_SUCCESS"; + + case NVJPEG_STATUS_NOT_INITIALIZED: + return "NVJPEG_STATUS_NOT_INITIALIZED"; + + case NVJPEG_STATUS_INVALID_PARAMETER: + return "NVJPEG_STATUS_INVALID_PARAMETER"; + + case NVJPEG_STATUS_BAD_JPEG: + return "NVJPEG_STATUS_BAD_JPEG"; + + case NVJPEG_STATUS_JPEG_NOT_SUPPORTED: + return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED"; + + case NVJPEG_STATUS_ALLOCATOR_FAILURE: + return "NVJPEG_STATUS_ALLOCATOR_FAILURE"; + + case NVJPEG_STATUS_EXECUTION_FAILED: + return "NVJPEG_STATUS_EXECUTION_FAILED"; + + case NVJPEG_STATUS_ARCH_MISMATCH: + return "NVJPEG_STATUS_ARCH_MISMATCH"; + + case NVJPEG_STATUS_INTERNAL_ERROR: + return "NVJPEG_STATUS_INTERNAL_ERROR"; + } + + return ""; +} 
+#endif + +#ifdef NV_NPPIDEFS_H +// NPP API errors +static const char *_cudaGetErrorEnum(NppStatus error) { + switch (error) { + case NPP_NOT_SUPPORTED_MODE_ERROR: + return "NPP_NOT_SUPPORTED_MODE_ERROR"; + + case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_RESIZE_NO_OPERATION_ERROR: + return "NPP_RESIZE_NO_OPERATION_ERROR"; + + case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: + return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_BAD_ARG_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFF_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECT_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUAD_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEM_ALLOC_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_INPUT: + return "NPP_INVALID_INPUT"; + + case NPP_POINTER_ERROR: + return "NPP_POINTER_ERROR"; + + case NPP_WARNING: + return "NPP_WARNING"; + + case NPP_ODD_ROI_WARNING: + return "NPP_ODD_ROI_WARNING"; +#else + + // These are for CUDA 5.5 or higher + case NPP_BAD_ARGUMENT_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFFICIENT_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECTANGLE_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUADRANGLE_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEMORY_ALLOCATION_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_HOST_POINTER_ERROR: + return "NPP_INVALID_HOST_POINTER_ERROR"; + + case NPP_INVALID_DEVICE_POINTER_ERROR: + return "NPP_INVALID_DEVICE_POINTER_ERROR"; +#endif + + case NPP_LUT_NUMBER_OF_LEVELS_ERROR: + return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; + + case NPP_TEXTURE_BIND_ERROR: + return "NPP_TEXTURE_BIND_ERROR"; + + case 
NPP_WRONG_INTERSECTION_ROI_ERROR: + return "NPP_WRONG_INTERSECTION_ROI_ERROR"; + + case NPP_NOT_EVEN_STEP_ERROR: + return "NPP_NOT_EVEN_STEP_ERROR"; + + case NPP_INTERPOLATION_ERROR: + return "NPP_INTERPOLATION_ERROR"; + + case NPP_RESIZE_FACTOR_ERROR: + return "NPP_RESIZE_FACTOR_ERROR"; + + case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: + return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_MEMFREE_ERR: + return "NPP_MEMFREE_ERR"; + + case NPP_MEMSET_ERR: + return "NPP_MEMSET_ERR"; + + case NPP_MEMCPY_ERR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERR: + return "NPP_MIRROR_FLIP_ERR"; +#else + + case NPP_MEMFREE_ERROR: + return "NPP_MEMFREE_ERROR"; + + case NPP_MEMSET_ERROR: + return "NPP_MEMSET_ERROR"; + + case NPP_MEMCPY_ERROR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERROR: + return "NPP_MIRROR_FLIP_ERROR"; +#endif + + case NPP_ALIGNMENT_ERROR: + return "NPP_ALIGNMENT_ERROR"; + + case NPP_STEP_ERROR: + return "NPP_STEP_ERROR"; + + case NPP_SIZE_ERROR: + return "NPP_SIZE_ERROR"; + + case NPP_NULL_POINTER_ERROR: + return "NPP_NULL_POINTER_ERROR"; + + case NPP_CUDA_KERNEL_EXECUTION_ERROR: + return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; + + case NPP_NOT_IMPLEMENTED_ERROR: + return "NPP_NOT_IMPLEMENTED_ERROR"; + + case NPP_ERROR: + return "NPP_ERROR"; + + case NPP_SUCCESS: + return "NPP_SUCCESS"; + + case NPP_WRONG_INTERSECTION_QUAD_WARNING: + return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; + + case NPP_MISALIGNED_DST_ROI_WARNING: + return "NPP_MISALIGNED_DST_ROI_WARNING"; + + case NPP_AFFINE_QUAD_INCORRECT_WARNING: + return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; + + case NPP_DOUBLE_SIZE_WARNING: + return "NPP_DOUBLE_SIZE_WARNING"; + + case NPP_WRONG_INTERSECTION_ROI_WARNING: + return "NPP_WRONG_INTERSECTION_ROI_WARNING"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000 + /* These are 6.0 or higher */ + case NPP_LUT_PALETTE_BITSIZE_ERROR: + return 
"NPP_LUT_PALETTE_BITSIZE_ERROR"; + + case NPP_ZC_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_QUALITY_INDEX_ERROR: + return "NPP_QUALITY_INDEX_ERROR"; + + case NPP_CHANNEL_ORDER_ERROR: + return "NPP_CHANNEL_ORDER_ERROR"; + + case NPP_ZERO_MASK_VALUE_ERROR: + return "NPP_ZERO_MASK_VALUE_ERROR"; + + case NPP_NUMBER_OF_CHANNELS_ERROR: + return "NPP_NUMBER_OF_CHANNELS_ERROR"; + + case NPP_COI_ERROR: + return "NPP_COI_ERROR"; + + case NPP_DIVISOR_ERROR: + return "NPP_DIVISOR_ERROR"; + + case NPP_CHANNEL_ERROR: + return "NPP_CHANNEL_ERROR"; + + case NPP_STRIDE_ERROR: + return "NPP_STRIDE_ERROR"; + + case NPP_ANCHOR_ERROR: + return "NPP_ANCHOR_ERROR"; + + case NPP_MASK_SIZE_ERROR: + return "NPP_MASK_SIZE_ERROR"; + + case NPP_MOMENT_00_ZERO_ERROR: + return "NPP_MOMENT_00_ZERO_ERROR"; + + case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR: + return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR"; + + case NPP_THRESHOLD_ERROR: + return "NPP_THRESHOLD_ERROR"; + + case NPP_CONTEXT_MATCH_ERROR: + return "NPP_CONTEXT_MATCH_ERROR"; + + case NPP_FFT_FLAG_ERROR: + return "NPP_FFT_FLAG_ERROR"; + + case NPP_FFT_ORDER_ERROR: + return "NPP_FFT_ORDER_ERROR"; + + case NPP_SCALE_RANGE_ERROR: + return "NPP_SCALE_RANGE_ERROR"; + + case NPP_DATA_TYPE_ERROR: + return "NPP_DATA_TYPE_ERROR"; + + case NPP_OUT_OFF_RANGE_ERROR: + return "NPP_OUT_OFF_RANGE_ERROR"; + + case NPP_DIVIDE_BY_ZERO_ERROR: + return "NPP_DIVIDE_BY_ZERO_ERROR"; + + case NPP_RANGE_ERROR: + return "NPP_RANGE_ERROR"; + + case NPP_NO_MEMORY_ERROR: + return "NPP_NO_MEMORY_ERROR"; + + case NPP_ERROR_RESERVED: + return "NPP_ERROR_RESERVED"; + + case NPP_NO_OPERATION_WARNING: + return "NPP_NO_OPERATION_WARNING"; + + case NPP_DIVIDE_BY_ZERO_WARNING: + return "NPP_DIVIDE_BY_ZERO_WARNING"; +#endif + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000 + /* These are 7.0 or higher */ + case NPP_OVERFLOW_ERROR: + return "NPP_OVERFLOW_ERROR"; + + case NPP_CORRUPTED_DATA_ERROR: + return 
"NPP_CORRUPTED_DATA_ERROR"; +#endif + } + + return ""; +} +#endif + +template +void check(T result, char const *const func, const char *const file, + int const line) { +} + +#ifdef __DPCT_HPP__ +// This will output the proper CUDA error strings in the event +// that a CUDA host call returns an error +#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) + +// This will output the proper error string when calling cudaGetLastError +#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__) + +inline void __getLastCudaError(const char *errorMessage, const char *file, + const int line) { + /* + DPCT1010:5: SYCL uses exceptions to report errors and does not use the error + codes. The call was replaced with 0. You need to rewrite this code. + */ + int err = 0; +} + +// This will only print the proper error string when calling cudaGetLastError +// but not exit program incase error detected. +#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__) + +inline void __printLastCudaError(const char *errorMessage, const char *file, + const int line) { + /* + DPCT1010:7: SYCL uses exceptions to report errors and does not use the error + codes. The call was replaced with 0. You need to rewrite this code. + */ + int err = 0; +} +#endif + +#ifndef MAX +#define MAX(a, b) (a > b ? a : b) +#endif + +// Float To Int conversion +inline int ftoi(float value) { + return (value >= 0 ? 
static_cast(value + 0.5) + : static_cast(value - 0.5)); +} + +// Beginning of GPU Architecture definitions +inline int _ConvertSMVer2Cores(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine + // the # of cores per SM + typedef struct dpct_type_771618 { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = { + {0x30, 192}, + {0x32, 192}, + {0x35, 192}, + {0x37, 192}, + {0x50, 128}, + {0x52, 128}, + {0x53, 128}, + {0x60, 64}, + {0x61, 128}, + {0x62, 128}, + {0x70, 64}, + {0x72, 64}, + {0x75, 64}, + {0x80, 64}, + {0x86, 128}, + {0x87, 128}, + {0x89, 128}, + {0x90, 128}, + {-1, -1}}; + + int index = 0; + + while (nGpuArchCoresPerSM[index].SM != -1) { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { + return nGpuArchCoresPerSM[index].Cores; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoCores for SM %d.%d is undefined." 
+ " Default to use %d Cores/SM\n", + major, minor, nGpuArchCoresPerSM[index - 1].Cores); + return nGpuArchCoresPerSM[index - 1].Cores; +} + +inline const char* _ConvertSMVer2ArchName(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine + // the GPU Arch name) + typedef struct dpct_type_506579 { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + const char* name; + } sSMtoArchName; + + sSMtoArchName nGpuArchNameSM[] = { + {0x30, "Kepler"}, + {0x32, "Kepler"}, + {0x35, "Kepler"}, + {0x37, "Kepler"}, + {0x50, "Maxwell"}, + {0x52, "Maxwell"}, + {0x53, "Maxwell"}, + {0x60, "Pascal"}, + {0x61, "Pascal"}, + {0x62, "Pascal"}, + {0x70, "Volta"}, + {0x72, "Xavier"}, + {0x75, "Turing"}, + {0x80, "Ampere"}, + {0x86, "Ampere"}, + {0x87, "Ampere"}, + {0x89, "Ada"}, + {0x90, "Hopper"}, + {-1, "Graphics Device"}}; + + int index = 0; + + while (nGpuArchNameSM[index].SM != -1) { + if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) { + return nGpuArchNameSM[index].name; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoArchName for SM %d.%d is undefined." + " Default to use %s\n", + major, minor, nGpuArchNameSM[index - 1].name); + return nGpuArchNameSM[index - 1].name; +} + // end of GPU Architecture definitions + +#ifdef __DPCT_HPP__ +// General GPU Device CUDA Initialization +inline int gpuDeviceInit(int devID) { + int device_count; + /* + DPCT1003:9: Migrated API does not return error code. (*, 0) is inserted. You + may need to rewrite this code. 
+ */ + checkCudaErrors((device_count = dpct::dev_mgr::instance().device_count(), 0)); + + if (device_count == 0) { + fprintf(stderr, + "gpuDeviceInit() CUDA error: " + "no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + if (devID < 0) { + devID = 0; + } + + if (devID > device_count - 1) { + fprintf(stderr, "\n"); + fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", + device_count); + fprintf(stderr, + ">> gpuDeviceInit (-device=%d) is not a valid" + " GPU device. <<\n", + devID); + fprintf(stderr, "\n"); + return -devID; + } + + int computeMode = -1, major = 0, minor = 0; + /* + DPCT1035:10: All SYCL devices can be used by host to submit tasks. You may + need to adjust this code. + */ + checkCudaErrors((computeMode = 1, 0)); + checkCudaErrors( + (major = dpct::dev_mgr::instance().get_device(devID).get_major_version(), + 0)); + checkCudaErrors( + (minor = dpct::dev_mgr::instance().get_device(devID).get_minor_version(), + 0)); + /* + DPCT1035:11: All SYCL devices can be used by host to submit tasks. You may + need to adjust this code. + */ + if (computeMode == 0) { + fprintf(stderr, + "Error: device is running in , no threads can use cudaSetDevice().\n"); + return -1; + } + + if (major < 1) { + fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); + exit(EXIT_FAILURE); + } + + /* + DPCT1093:12: The "devID" may not be the best XPU device. Adjust the selected + device if needed. + */ + /* + DPCT1003:13: Migrated API does not return error code. (*, 0) is inserted. You + may need to rewrite this code. 
+ */ + checkCudaErrors((dpct::select_device(devID), 0)); + printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, _ConvertSMVer2ArchName(major, minor)); + + return devID; +} + +// This function returns the best GPU (with maximum GFLOPS) +inline int gpuGetMaxGflopsDeviceId() try { + int current_device = 0, sm_per_multiproc = 0; + int max_perf_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + uint64_t max_compute_perf = 0; + /* + DPCT1003:14: Migrated API does not return error code. (*, 0) is inserted. You + may need to rewrite this code. + */ + checkCudaErrors((device_count = dpct::dev_mgr::instance().device_count(), 0)); + + if (device_count == 0) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the best CUDA capable GPU device + current_device = 0; + + while (current_device < device_count) { + int computeMode = -1, major = 0, minor = 0; + /* + DPCT1035:15: All SYCL devices can be used by host to submit tasks. You may + need to adjust this code. + */ + checkCudaErrors((computeMode = 1, 0)); + checkCudaErrors((major = dpct::dev_mgr::instance() + .get_device(current_device) + .get_major_version(), + 0)); + checkCudaErrors((minor = dpct::dev_mgr::instance() + .get_device(current_device) + .get_minor_version(), + 0)); + + // If this GPU is not running on Compute Mode prohibited, + // then we can add it to the list + /* + DPCT1035:16: All SYCL devices can be used by host to submit tasks. You may + need to adjust this code. 
+ */ + if (computeMode != 0) { + if (major == 9999 && minor == 9999) { + sm_per_multiproc = 1; + } else { + sm_per_multiproc = + _ConvertSMVer2Cores(major, minor); + } + int multiProcessorCount = 0, clockRate = 0; + checkCudaErrors((multiProcessorCount = dpct::dev_mgr::instance() + .get_device(current_device) + .get_max_compute_units(), + 0)); + int result = (clockRate = dpct::dev_mgr::instance() + .get_device(current_device) + .get_max_clock_frequency(), + 0); + + uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate; + + if (compute_perf > max_compute_perf) { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } else { + devices_prohibited++; + } + + ++current_device; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " all devices have compute mode prohibited.\n"); + exit(EXIT_FAILURE); + } + + return max_perf_device; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +// Initialization code to find the best CUDA Device +inline int findCudaDevice(int argc, const char **argv) { + int devID = 0; + + // If the command-line has a device number specified, use it + if (checkCmdLineFlag(argc, argv, "device")) { + devID = getCmdLineArgumentInt(argc, argv, "device="); + + if (devID < 0) { + printf("Invalid command line parameter\n "); + exit(EXIT_FAILURE); + } else { + devID = gpuDeviceInit(devID); + + if (devID < 0) { + printf("exiting...\n"); + exit(EXIT_FAILURE); + } + } + } else { + // Otherwise pick the device with highest Gflops/s + devID = gpuGetMaxGflopsDeviceId(); + /* + DPCT1093:17: The "devID" may not be the best XPU device. Adjust the selected + device if needed. + */ + /* + DPCT1003:18: Migrated API does not return error code. (*, 0) is inserted. + You may need to rewrite this code. 
+ */ + checkCudaErrors((dpct::select_device(devID), 0)); + int major = 0, minor = 0; + checkCudaErrors(( + major = dpct::dev_mgr::instance().get_device(devID).get_major_version(), + 0)); + checkCudaErrors(( + minor = dpct::dev_mgr::instance().get_device(devID).get_minor_version(), + 0)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + devID, _ConvertSMVer2ArchName(major, minor), major, minor); + + } + + return devID; +} + +inline int findIntegratedGPU() { + int current_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + /* + DPCT1003:19: Migrated API does not return error code. (*, 0) is inserted. You + may need to rewrite this code. + */ + checkCudaErrors((device_count = dpct::dev_mgr::instance().device_count(), 0)); + + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the integrated GPU which is compute capable + while (current_device < device_count) { + int computeMode = -1, integrated = -1; + /* + DPCT1035:20: All SYCL devices can be used by host to submit tasks. You may + need to adjust this code. + */ + checkCudaErrors((computeMode = 1, 0)); + checkCudaErrors((integrated = dpct::dev_mgr::instance() + .get_device(current_device) + .get_integrated(), + 0)); + // If GPU is integrated and is not running on Compute Mode prohibited, + // then cuda can map to GLES resource + /* + DPCT1035:21: All SYCL devices can be used by host to submit tasks. You may + need to adjust this code. + */ + if (integrated && (computeMode != 0)) { + /* + DPCT1093:22: The "current_device" may not be the best XPU device. Adjust + the selected device if needed. + */ + /* + DPCT1003:23: Migrated API does not return error code. (*, 0) is inserted. + You may need to rewrite this code. 
+ */ + checkCudaErrors((dpct::select_device(current_device), 0)); + + int major = 0, minor = 0; + checkCudaErrors((major = dpct::dev_mgr::instance() + .get_device(current_device) + .get_major_version(), + 0)); + checkCudaErrors((minor = dpct::dev_mgr::instance() + .get_device(current_device) + .get_minor_version(), + 0)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + current_device, _ConvertSMVer2ArchName(major, minor), major, minor); + + return current_device; + } else { + devices_prohibited++; + } + + current_device++; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "CUDA error:" + " No GLES-CUDA Interop capable GPU found.\n"); + exit(EXIT_FAILURE); + } + + return -1; +} + +// General check for CUDA GPU SM Capabilities +inline bool checkCudaCapabilities(int major_version, int minor_version) { + int dev; + int major = 0, minor = 0; + + checkCudaErrors(dev = dpct::dev_mgr::instance().current_device_id()); + checkCudaErrors( + (major = dpct::dev_mgr::instance().get_device(dev).get_major_version(), + 0)); + checkCudaErrors( + (minor = dpct::dev_mgr::instance().get_device(dev).get_minor_version(), + 0)); + + if ((major > major_version) || + (major == major_version && + minor >= minor_version)) { + printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, + _ConvertSMVer2ArchName(major, minor), major, minor); + return true; + } else { + printf( + " No GPU device was found that can support " + "CUDA compute capability %d.%d.\n", + major_version, minor_version); + return false; + } +} +#endif + + // end of CUDA Helper Functions + +#endif // COMMON_HELPER_CUDA_H_ diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Common/helper_cuda.h.yaml b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Common/helper_cuda.h.yaml new file mode 100644 index 0000000000..bcf5416a0d --- /dev/null +++ 
b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Common/helper_cuda.h.yaml @@ -0,0 +1,889 @@ +--- +MainSourceFile: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/dpct_output/Common/helper_cuda.h' +Replacements: + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 1793 + Length: 0 + ReplacementText: "#include \n#include \n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 2210 + Length: 18 + ReplacementText: __DPCT_HPP__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 2266 + Length: 11 + ReplacementText: int + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 2287 + Length: 0 + ReplacementText: " /*\n DPCT1009:4: SYCL uses exceptions to report errors and does not use the error codes. The original code was commented out and a warning string was inserted. 
You need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 2296 + Length: 23 + ReplacementText: '"cudaGetErrorName is not supported"/*cudaGetErrorName(error)*/' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7109 + Length: 14 + ReplacementText: int + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7161 + Length: 21 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7232 + Length: 30 + ReplacementText: '100' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7321 + Length: 29 + ReplacementText: '101' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7408 + Length: 31 + ReplacementText: '102' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7499 + Length: 24 + ReplacementText: '103' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7576 + Length: 26 + ReplacementText: '104' + ConstantFlag: '' + 
ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7657 + Length: 33 + ReplacementText: '105' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7752 + Length: 39 + ReplacementText: '106' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7859 + Length: 28 + ReplacementText: '201' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 7944 + Length: 33 + ReplacementText: '202' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 8039 + Length: 35 + ReplacementText: '203' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 8138 + Length: 27 + ReplacementText: '204' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 8221 + Length: 28 + ReplacementText: '999' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 15785 + Length: 199 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: 
'/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 15995 + Length: 18 + ReplacementText: __DPCT_HPP__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 16461 + Length: 0 + ReplacementText: " /*\n DPCT1010:5: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 16463 + Length: 11 + ReplacementText: int + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 16481 + Length: 18 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 16504 + Length: 259 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 17099 + Length: 0 + ReplacementText: " /*\n DPCT1010:7: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. 
You need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 17101 + Length: 11 + ReplacementText: int + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 17119 + Length: 18 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 17142 + Length: 235 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 17829 + Length: 0 + ReplacementText: ' dpct_type_771618' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 19036 + Length: 0 + ReplacementText: ' dpct_type_506579' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 20194 + Length: 18 + ReplacementText: __DPCT_HPP__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 20313 + Length: 0 + ReplacementText: " /*\n DPCT1003:9: Migrated API does not return error code. (*, 0) is inserted. 
You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 20331 + Length: 33 + ReplacementText: '(device_count = dpct::dev_mgr::instance().device_count(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 20960 + Length: 0 + ReplacementText: " /*\n DPCT1035:10: All SYCL devices can be used by host to submit tasks. You may need to adjust this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 20978 + Length: 67 + ReplacementText: '(computeMode = 1, 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 21066 + Length: 72 + ReplacementText: '(major = dpct::dev_mgr::instance().get_device(devID).get_major_version(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 21159 + Length: 72 + ReplacementText: '(minor = dpct::dev_mgr::instance().get_device(devID).get_minor_version(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 21234 + Length: 0 + ReplacementText: " /*\n DPCT1035:11: All SYCL devices can be used by host to submit tasks. 
You may need to adjust this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 21255 + Length: 25 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 21573 + Length: 0 + ReplacementText: " /*\n DPCT1093:12: The \"devID\" may not be the best XPU device. Adjust the selected device if needed.\n */\n /*\n DPCT1003:13: Migrated API does not return error code. (*, 0) is inserted. You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 21591 + Length: 13 + ReplacementText: '(dpct::select_device' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 21611 + Length: 0 + ReplacementText: ', 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 21830 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 21995 + Length: 0 + ReplacementText: " /*\n DPCT1003:14: Migrated API does not return error code. (*, 0) is inserted. 
You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 22013 + Length: 33 + ReplacementText: '(device_count = dpct::dev_mgr::instance().device_count(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 22381 + Length: 0 + ReplacementText: " /*\n DPCT1035:15: All SYCL devices can be used by host to submit tasks. You may need to adjust this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 22401 + Length: 76 + ReplacementText: '(computeMode = 1, 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 22500 + Length: 81 + ReplacementText: '(major = dpct::dev_mgr::instance().get_device(current_device).get_major_version(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 22604 + Length: 81 + ReplacementText: '(minor = dpct::dev_mgr::instance().get_device(current_device).get_minor_version(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 22789 + Length: 0 + ReplacementText: " /*\n DPCT1035:16: All SYCL devices can be used by host to submit tasks. 
You may need to adjust this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 22812 + Length: 25 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 23085 + Length: 92 + ReplacementText: '(multiProcessorCount = dpct::dev_mgr::instance().get_device(current_device).get_max_compute_units(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 23186 + Length: 11 + ReplacementText: int + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 23207 + Length: 72 + ReplacementText: '(clockRate = dpct::dev_mgr::instance().get_device(current_device).get_max_clock_frequency(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 23287 + Length: 473 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 24298 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + 
Offset: 24947 + Length: 0 + ReplacementText: " /*\n DPCT1093:17: The \"devID\" may not be the best XPU device. Adjust the selected device if needed.\n */\n /*\n DPCT1003:18: Migrated API does not return error code. (*, 0) is inserted. You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 24967 + Length: 13 + ReplacementText: '(dpct::select_device' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 24987 + Length: 0 + ReplacementText: ', 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 25040 + Length: 72 + ReplacementText: '(major = dpct::dev_mgr::instance().get_device(devID).get_major_version(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 25135 + Length: 72 + ReplacementText: '(minor = dpct::dev_mgr::instance().get_device(devID).get_minor_version(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 25490 + Length: 0 + ReplacementText: " /*\n DPCT1003:19: Migrated API does not return error code. (*, 0) is inserted. 
You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 25508 + Length: 33 + ReplacementText: '(device_count = dpct::dev_mgr::instance().device_count(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 25806 + Length: 0 + ReplacementText: " /*\n DPCT1035:20: All SYCL devices can be used by host to submit tasks. You may need to adjust this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 25826 + Length: 76 + ReplacementText: '(computeMode = 1, 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 25925 + Length: 74 + ReplacementText: '(integrated = dpct::dev_mgr::instance().get_device(current_device).get_integrated(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 26119 + Length: 0 + ReplacementText: " /*\n DPCT1035:21: All SYCL devices can be used by host to submit tasks. 
You may need to adjust this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 26157 + Length: 25 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 26187 + Length: 0 + ReplacementText: " /*\n DPCT1093:22: The \"current_device\" may not be the best XPU device. Adjust the selected device if needed.\n */\n /*\n DPCT1003:23: Migrated API does not return error code. (*, 0) is inserted. You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 26209 + Length: 13 + ReplacementText: '(dpct::select_device' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 26238 + Length: 0 + ReplacementText: ', 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 26296 + Length: 81 + ReplacementText: '(major = dpct::dev_mgr::instance().get_device(current_device).get_major_version(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 26402 + Length: 81 + ReplacementText: '(minor = dpct::dev_mgr::instance().get_device(current_device).get_minor_version(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: 
'/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 27117 + Length: 19 + ReplacementText: 'dev = dpct::dev_mgr::instance().current_device_id()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 27157 + Length: 70 + ReplacementText: '(major = dpct::dev_mgr::instance().get_device(dev).get_major_version(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Offset: 27248 + Length: 70 + ReplacementText: '(minor = dpct::dev_mgr::instance().get_device(dev).get_minor_version(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false +MainSourceFilesDigest: + - MainSourceFile: '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + Digest: b335aa967b6d80921de9d6528e1564a5 +DpctVersion: 2023.0.0 +MainHelperFileName: dpct +USMLevel: '' +FeatureMap: + device.hpp: + dev_mgr: + IsCalled: false + FeatureName: '' + SubFeatureMap: + dev_mgr_current_device_id: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'dev_mgr::current_device_id' + SubFeatureMap: {} + dev_mgr_device_count: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'dev_mgr::device_count' + SubFeatureMap: {} + dev_mgr_get_device: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'dev_mgr::get_device' + SubFeatureMap: {} + device_ext: + IsCalled: false + FeatureName: '' + SubFeatureMap: + device_ext_get_integrated: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_integrated' + 
SubFeatureMap: {} + device_ext_get_major_version: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_major_version' + SubFeatureMap: {} + device_ext_get_max_clock_frequency: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_max_clock_frequency' + SubFeatureMap: {} + device_ext_get_max_compute_units: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_max_compute_units' + SubFeatureMap: {} + device_ext_get_minor_version: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_minor_version' + SubFeatureMap: {} + device_ext_queues_wait_and_throw: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: 'device_ext::queues_wait_and_throw' + SubFeatureMap: {} + get_current_device: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: get_current_device + SubFeatureMap: {} + get_default_queue: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: get_default_queue + SubFeatureMap: {} + select_device: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: select_device + SubFeatureMap: {} + memory.hpp: + constant_memory_alias: + IsCalled: true + CallerSrcFiles: + - 
'/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: constant_memory + SubFeatureMap: {} + device_memory: + IsCalled: false + FeatureName: '' + SubFeatureMap: + device_memory_get_ptr: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: 'device_memory::get_ptr' + SubFeatureMap: {} + device_memory_init: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: 'device_memory::init' + SubFeatureMap: {} + dpct_memcpy: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_memcpy + SubFeatureMap: {} + dpct_memcpy_2d: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_memcpy + SubFeatureMap: {} + dpct_memcpy_3d: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_memcpy + SubFeatureMap: {} + memcpy_direction: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: memcpy_direction + SubFeatureMap: {} +CompileTargets: {} +OptionMap: + AnalysisScopePath: + Value: '/home/tcs/Manjula_workspace/cuda-samples' + Specified: false + AsyncHandler: + Value: 'false' + Specified: false + CommentsEnabled: + Value: 'false' + Specified: false + CompilationsDir: + Value: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable' + Specified: true + CtadEnabled: + Value: 'false' + Specified: 
false + CustomHelperFileName: + Value: dpct + Specified: false + ExperimentalFlag: + Value: '0' + Specified: false + ExplicitClNamespace: + Value: 'false' + Specified: false + ExplicitNamespace: + Value: '20' + Specified: false + ExtensionDDFlag: + Value: '0' + Specified: false + ExtensionDEFlag: + Value: '4294967295' + Specified: false + NDRangeDim: + Value: '3' + Specified: false + NoDRYPattern: + Value: 'false' + Specified: false + OptimizeMigration: + Value: 'false' + Specified: false + ProcessAll: + Value: 'false' + Specified: false + RuleFile: + Value: '' + Specified: false + SyclNamedLambda: + Value: 'false' + Specified: false + UsmLevel: + Value: '1' + Specified: false +... diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Common/helper_functions.h b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Common/helper_functions.h new file mode 100644 index 0000000000..2975ddba6a --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Common/helper_functions.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// These are helper functions for the SDK samples (string parsing, +// timers, image helpers, etc) +#ifndef COMMON_HELPER_FUNCTIONS_H_ +#define COMMON_HELPER_FUNCTIONS_H_ + +#ifdef WIN32 +#pragma warning(disable : 4996) +#endif + +// includes, project +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +// includes, timer, string parsing, image helpers +#include // helper functions for image compare, dump, data comparisons +#include // helper functions for string parsing +#include // helper functions for timers + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +#endif // COMMON_HELPER_FUNCTIONS_H_ diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Common/helper_image.h b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Common/helper_image.h new file mode 100644 index 0000000000..9b7edc062c --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Common/helper_image.h @@ -0,0 +1,1001 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// These are helper functions for the SDK samples (image,bitmap) +#ifndef COMMON_HELPER_IMAGE_H_ +#define COMMON_HELPER_IMAGE_H_ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifndef MIN +#define MIN(a, b) ((a < b) ? a : b) +#endif +#ifndef MAX +#define MAX(a, b) ((a > b) ? a : b) +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +#include + +// namespace unnamed (internal) +namespace helper_image_internal { +//! 
size of PGM file header +const unsigned int PGMHeaderSize = 0x40; + +// types + +//! Data converter from unsigned char / unsigned byte to type T +template +struct ConverterFromUByte; + +//! Data converter from unsigned char / unsigned byte +template <> +struct ConverterFromUByte { + //! Conversion operator + //! @return converted value + //! @param val value to convert + float operator()(const unsigned char &val) { + return static_cast(val); + } +}; + +//! Data converter from unsigned char / unsigned byte to float +template <> +struct ConverterFromUByte { + //! Conversion operator + //! @return converted value + //! @param val value to convert + float operator()(const unsigned char &val) { + return static_cast(val) / 255.0f; + } +}; + +//! Data converter from unsigned char / unsigned byte to type T +template +struct ConverterToUByte; + +//! Data converter from unsigned char / unsigned byte to unsigned int +template <> +struct ConverterToUByte { + //! Conversion operator (essentially a passthru + //! @return converted value + //! @param val value to convert + unsigned char operator()(const unsigned char &val) { return val; } +}; + +//! Data converter from unsigned char / unsigned byte to unsigned int +template <> +struct ConverterToUByte { + //! Conversion operator + //! @return converted value + //! 
@param val value to convert + unsigned char operator()(const float &val) { + return static_cast(val * 255.0f); + } +}; +} // namespace helper_image_internal + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result != 0) +#endif +#ifndef SSCANF +#define SSCANF sscanf_s +#endif +#else +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result == NULL) +#endif +#ifndef SSCANF +#define SSCANF sscanf +#endif +#endif + +inline bool __loadPPM(const char *file, unsigned char **data, unsigned int *w, + unsigned int *h, unsigned int *channels) { + FILE *fp = NULL; + + if (FOPEN_FAIL(FOPEN(fp, file, "rb"))) { + std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl; + return false; + } + + // check header + char header[helper_image_internal::PGMHeaderSize]; + + if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) { + std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl; + return false; + } + + if (strncmp(header, "P5", 2) == 0) { + *channels = 1; + } else if (strncmp(header, "P6", 2) == 0) { + *channels = 3; + } else { + std::cerr << "__LoadPPM() : File is not a PPM or PGM image" << std::endl; + *channels = 0; + return false; + } + + // parse header, read maxval, width and height + unsigned int width = 0; + unsigned int height = 0; + unsigned int maxval = 0; + unsigned int i = 0; + + while (i < 3) { + if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) { + std::cerr << "__LoadPPM() : reading PGM header returned NULL" + << std::endl; + return false; + } + + if (header[0] == '#') { + continue; + } + + if (i == 0) { + i += SSCANF(header, "%u %u %u", &width, &height, &maxval); + } else if (i == 1) { + i += SSCANF(header, "%u %u", &height, 
&maxval); + } else if (i == 2) { + i += SSCANF(header, "%u", &maxval); + } + } + + // check if given handle for the data is initialized + if (NULL != *data) { + if (*w != width || *h != height) { + std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl; + } + } else { + *data = (unsigned char *)malloc(sizeof(unsigned char) * width * height * + *channels); + *w = width; + *h = height; + } + + // read and close file + if (fread(*data, sizeof(unsigned char), width * height * *channels, fp) == + 0) { + std::cerr << "__LoadPPM() read data returned error." << std::endl; + } + + fclose(fp); + + return true; +} + +template +inline bool sdkLoadPGM(const char *file, T **data, unsigned int *w, + unsigned int *h) { + unsigned char *idata = NULL; + unsigned int channels; + + if (true != __loadPPM(file, &idata, w, h, &channels)) { + return false; + } + + unsigned int size = *w * *h * channels; + + // initialize mem if necessary + // the correct size is checked / set in loadPGMc() + if (NULL == *data) { + *data = reinterpret_cast(malloc(sizeof(T) * size)); + } + + // copy and cast data + std::transform(idata, idata + size, *data, + helper_image_internal::ConverterFromUByte()); + + free(idata); + + return true; +} + +template +inline bool sdkLoadPPM4(const char *file, T **data, unsigned int *w, + unsigned int *h) { + unsigned char *idata = 0; + unsigned int channels; + + if (__loadPPM(file, &idata, w, h, &channels)) { + // pad 4th component + int size = *w * *h; + // keep the original pointer + unsigned char *idata_orig = idata; + *data = reinterpret_cast(malloc(sizeof(T) * size * 4)); + unsigned char *ptr = *data; + + for (int i = 0; i < size; i++) { + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = 0; + } + + free(idata_orig); + return true; + } else { + free(idata); + return false; + } +} + +inline bool __savePPM(const char *file, unsigned char *data, unsigned int w, + unsigned int h, unsigned int channels) { + assert(NULL != data); + 
assert(w > 0); + assert(h > 0); + + std::fstream fh(file, std::fstream::out | std::fstream::binary); + + if (fh.bad()) { + std::cerr << "__savePPM() : Opening file failed." << std::endl; + return false; + } + + if (channels == 1) { + fh << "P5\n"; + } else if (channels == 3) { + fh << "P6\n"; + } else { + std::cerr << "__savePPM() : Invalid number of channels." << std::endl; + return false; + } + + fh << w << "\n" << h << "\n" << 0xff << std::endl; + + for (unsigned int i = 0; (i < (w * h * channels)) && fh.good(); ++i) { + fh << data[i]; + } + + fh.flush(); + + if (fh.bad()) { + std::cerr << "__savePPM() : Writing data failed." << std::endl; + return false; + } + + fh.close(); + + return true; +} + +template +inline bool sdkSavePGM(const char *file, T *data, unsigned int w, + unsigned int h) { + unsigned int size = w * h; + unsigned char *idata = (unsigned char *)malloc(sizeof(unsigned char) * size); + + std::transform(data, data + size, idata, + helper_image_internal::ConverterToUByte()); + + // write file + bool result = __savePPM(file, idata, w, h, 1); + + // cleanup + free(idata); + + return result; +} + +inline bool sdkSavePPM4ub(const char *file, unsigned char *data, unsigned int w, + unsigned int h) { + // strip 4th component + int size = w * h; + unsigned char *ndata = + (unsigned char *)malloc(sizeof(unsigned char) * size * 3); + unsigned char *ptr = ndata; + + for (int i = 0; i < size; i++) { + *ptr++ = *data++; + *ptr++ = *data++; + *ptr++ = *data++; + data++; + } + + bool result = __savePPM(file, ndata, w, h, 3); + free(ndata); + return result; +} + +////////////////////////////////////////////////////////////////////////////// +//! Read file \filename and return the data +//! @return bool if reading the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! 
@param len number of data elements in data, -1 on error +////////////////////////////////////////////////////////////////////////////// +template +inline bool sdkReadFile(const char *filename, T **data, unsigned int *len, + bool verbose) { + // check input arguments + assert(NULL != filename); + assert(NULL != len); + + // intermediate storage for the data read + std::vector data_read; + + // open file for reading + FILE *fh = NULL; + + // check if filestream is valid + if (FOPEN_FAIL(FOPEN(fh, filename, "r"))) { + printf("Unable to open input file: %s\n", filename); + return false; + } + + // read all data elements + T token; + + while (!feof(fh)) { + fscanf(fh, "%f", &token); + data_read.push_back(token); + } + + // the last element is read twice + data_read.pop_back(); + fclose(fh); + + // check if the given handle is already initialized + if (NULL != *data) { + if (*len != data_read.size()) { + std::cerr << "sdkReadFile() : Initialized memory given but " + << "size mismatch with signal read " + << "(data read / data init = " << (unsigned int)data_read.size() + << " / " << *len << ")" << std::endl; + + return false; + } + } else { + // allocate storage for the data read + *data = reinterpret_cast(malloc(sizeof(T) * data_read.size())); + // store signal size + *len = static_cast(data_read.size()); + } + + // copy data + memcpy(*data, &data_read.front(), sizeof(T) * data_read.size()); + + return true; +} + +////////////////////////////////////////////////////////////////////////////// +//! Read file \filename and return the data +//! @return bool if reading the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! 
@param len number of data elements in data, -1 on error +////////////////////////////////////////////////////////////////////////////// +template +inline bool sdkReadFileBlocks(const char *filename, T **data, unsigned int *len, + unsigned int block_num, unsigned int block_size, + bool verbose) { + // check input arguments + assert(NULL != filename); + assert(NULL != len); + + // open file for reading + FILE *fh = fopen(filename, "rb"); + + if (fh == NULL && verbose) { + std::cerr << "sdkReadFile() : Opening file failed." << std::endl; + return false; + } + + // check if the given handle is already initialized + // allocate storage for the data read + data[block_num] = reinterpret_cast(malloc(block_size)); + + // read all data elements + fseek(fh, block_num * block_size, SEEK_SET); + *len = fread(data[block_num], sizeof(T), block_size / sizeof(T), fh); + + fclose(fh); + + return true; +} + +////////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename +//! @return true if writing the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data data to write +//! @param len number of data elements in data, -1 on error +//! @param epsilon epsilon for comparison +////////////////////////////////////////////////////////////////////////////// +template +inline bool sdkWriteFile(const char *filename, const T *data, unsigned int len, + const S epsilon, bool verbose, bool append = false) { + assert(NULL != filename); + assert(NULL != data); + + // open file for writing + // if (append) { + std::fstream fh(filename, std::fstream::out | std::fstream::ate); + + if (verbose) { + std::cerr << "sdkWriteFile() : Open file " << filename + << " for write/append." << std::endl; + } + + /* } else { + std::fstream fh(filename, std::fstream::out); + if (verbose) { + std::cerr << "sdkWriteFile() : Open file " << filename << " for + write." 
<< std::endl; + } + } + */ + + // check if filestream is valid + if (!fh.good()) { + if (verbose) { + std::cerr << "sdkWriteFile() : Opening file failed." << std::endl; + } + + return false; + } + + // first write epsilon + fh << "# " << epsilon << "\n"; + + // write data + for (unsigned int i = 0; (i < len) && (fh.good()); ++i) { + fh << data[i] << ' '; + } + + // Check if writing succeeded + if (!fh.good()) { + if (verbose) { + std::cerr << "sdkWriteFile() : Writing file failed." << std::endl; + } + + return false; + } + + // file ends with nl + fh << std::endl; + + return true; +} + +////////////////////////////////////////////////////////////////////////////// +//! Compare two arrays of arbitrary type +//! @return true if \a reference and \a data are identical, otherwise false +//! @param reference timer_interface to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +////////////////////////////////////////////////////////////////////////////// +template +inline bool compareData(const T *reference, const T *data, + const unsigned int len, const S epsilon, + const float threshold) { + assert(epsilon >= 0); + + bool result = true; + unsigned int error_count = 0; + + for (unsigned int i = 0; i < len; ++i) { + float diff = static_cast(reference[i]) - static_cast(data[i]); + bool comp = (diff <= epsilon) && (diff >= -epsilon); + result &= comp; + + error_count += !comp; + +#if 0 + + if (!comp) { + std::cerr << "ERROR, i = " << i << ",\t " + << reference[i] << " / " + << data[i] + << " (reference / data)\n"; + } + +#endif + } + + if (threshold == 0.0f) { + return (result) ? true : false; + } else { + if (error_count) { + printf("%4.2f(%%) of bytes mismatched (count=%d)\n", + static_cast(error_count) * 100 / static_cast(len), + error_count); + } + + return (len * threshold > error_count) ? 
true : false; + } +} + +#ifndef __MIN_EPSILON_ERROR +#define __MIN_EPSILON_ERROR 1e-3f +#endif + +////////////////////////////////////////////////////////////////////////////// +//! Compare two arrays of arbitrary type +//! @return true if \a reference and \a data are identical, otherwise false +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//! @param epsilon threshold % of (# of bytes) for pass/fail +////////////////////////////////////////////////////////////////////////////// +template +inline bool compareDataAsFloatThreshold(const T *reference, const T *data, + const unsigned int len, const S epsilon, + const float threshold) { + assert(epsilon >= 0); + + // If we set epsilon to be 0, let's set a minimum threshold + float max_error = MAX((float)epsilon, __MIN_EPSILON_ERROR); + int error_count = 0; + bool result = true; + + for (unsigned int i = 0; i < len; ++i) { + float diff = + fabs(static_cast(reference[i]) - static_cast(data[i])); + bool comp = (diff < max_error); + result &= comp; + + if (!comp) { + error_count++; + } + } + + if (threshold == 0.0f) { + if (error_count) { + printf("total # of errors = %d\n", error_count); + } + + return (error_count == 0) ? true : false; + } else { + if (error_count) { + printf("%4.2f(%%) of bytes mismatched (count=%d)\n", + static_cast(error_count) * 100 / static_cast(len), + error_count); + } + + return ((len * threshold > error_count) ? 
true : false); + } +} + +inline void sdkDumpBin(void *data, unsigned int bytes, const char *filename) { + printf("sdkDumpBin: <%s>\n", filename); + FILE *fp; + FOPEN(fp, filename, "wb"); + fwrite(data, bytes, 1, fp); + fflush(fp); + fclose(fp); +} + +inline bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, + unsigned int nelements, const float epsilon, + const float threshold, char *exec_path) { + unsigned int *src_buffer, *ref_buffer; + FILE *src_fp = NULL, *ref_fp = NULL; + + uint64_t error_count = 0; + size_t fsize = 0; + + if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) { + printf("compareBin2Bin unable to open src_file: %s\n", + src_file); + error_count++; + } + + char *ref_file_path = sdkFindFilePath(ref_file, exec_path); + + if (ref_file_path == NULL) { + printf("compareBin2Bin unable to find <%s> in <%s>\n", + ref_file, exec_path); + printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", + ref_file); + printf("Aborting comparison!\n"); + printf(" FAILED\n"); + error_count++; + + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } else { + if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) { + printf( + "compareBin2Bin " + " unable to open ref_file: %s\n", + ref_file_path); + error_count++; + } + + if (src_fp && ref_fp) { + src_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int)); + ref_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int)); + + fsize = fread(src_buffer, nelements, sizeof(unsigned int), src_fp); + fsize = fread(ref_buffer, nelements, sizeof(unsigned int), ref_fp); + + printf( + "> compareBin2Bin nelements=%d," + " epsilon=%4.2f, threshold=%4.2f\n", + nelements, epsilon, threshold); + printf(" src_file <%s>, size=%d bytes\n", src_file, + static_cast(fsize)); + printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, + static_cast(fsize)); + + if (!compareData(ref_buffer, src_buffer, nelements, + epsilon, threshold)) { + error_count++; + } + + fclose(src_fp); 
+ fclose(ref_fp); + + free(src_buffer); + free(ref_buffer); + } else { + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } + } + + if (error_count == 0) { + printf(" OK\n"); + } else { + printf(" FAILURE: %d errors...\n", (unsigned int)error_count); + } + + return (error_count == 0); // returns true if all pixels pass +} + +inline bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, + unsigned int nelements, const float epsilon, + const float threshold, char *exec_path) { + float *src_buffer = NULL, *ref_buffer = NULL; + FILE *src_fp = NULL, *ref_fp = NULL; + size_t fsize = 0; + + uint64_t error_count = 0; + + if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) { + printf("compareBin2Bin unable to open src_file: %s\n", src_file); + error_count = 1; + } + + char *ref_file_path = sdkFindFilePath(ref_file, exec_path); + + if (ref_file_path == NULL) { + printf("compareBin2Bin unable to find <%s> in <%s>\n", ref_file, + exec_path); + printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", + exec_path); + printf("Aborting comparison!\n"); + printf(" FAILED\n"); + error_count++; + + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } else { + if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) { + printf("compareBin2Bin unable to open ref_file: %s\n", + ref_file_path); + error_count = 1; + } + + if (src_fp && ref_fp) { + src_buffer = reinterpret_cast(malloc(nelements * sizeof(float))); + ref_buffer = reinterpret_cast(malloc(nelements * sizeof(float))); + + printf( + "> compareBin2Bin nelements=%d, epsilon=%4.2f," + " threshold=%4.2f\n", + nelements, epsilon, threshold); + fsize = fread(src_buffer, sizeof(float), nelements, src_fp); + printf(" src_file <%s>, size=%d bytes\n", src_file, + static_cast(fsize * sizeof(float))); + fsize = fread(ref_buffer, sizeof(float), nelements, ref_fp); + printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, + static_cast(fsize * sizeof(float))); + + if 
(!compareDataAsFloatThreshold( + ref_buffer, src_buffer, nelements, epsilon, threshold)) { + error_count++; + } + + fclose(src_fp); + fclose(ref_fp); + + free(src_buffer); + free(ref_buffer); + } else { + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } + } + + if (error_count == 0) { + printf(" OK\n"); + } else { + printf(" FAILURE: %d errors...\n", (unsigned int)error_count); + } + + return (error_count == 0); // returns true if all pixels pass +} + +inline bool sdkCompareL2fe(const float *reference, const float *data, + const unsigned int len, const float epsilon) { + assert(epsilon >= 0); + + float error = 0; + float ref = 0; + + for (unsigned int i = 0; i < len; ++i) { + float diff = reference[i] - data[i]; + error += diff * diff; + ref += reference[i] * reference[i]; + } + + float normRef = sqrtf(ref); + + if (fabs(ref) < 1e-7) { +#ifdef _DEBUG + std::cerr << "ERROR, reference l2-norm is 0\n"; +#endif + return false; + } + + float normError = sqrtf(error); + error = normError / normRef; + bool result = error < epsilon; +#ifdef _DEBUG + + if (!result) { + std::cerr << "ERROR, l2-norm error " << error << " is greater than epsilon " + << epsilon << "\n"; + } + +#endif + + return result; +} + +inline bool sdkLoadPPMub(const char *file, unsigned char **data, + unsigned int *w, unsigned int *h) { + unsigned int channels; + return __loadPPM(file, data, w, h, &channels); +} + +inline bool sdkLoadPPM4ub(const char *file, unsigned char **data, + unsigned int *w, unsigned int *h) { + unsigned char *idata = 0; + unsigned int channels; + + if (__loadPPM(file, &idata, w, h, &channels)) { + // pad 4th component + int size = *w * *h; + // keep the original pointer + unsigned char *idata_orig = idata; + *data = (unsigned char *)malloc(sizeof(unsigned char) * size * 4); + unsigned char *ptr = *data; + + for (int i = 0; i < size; i++) { + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = 0; + } + + free(idata_orig); + return 
true; + } else { + free(idata); + return false; + } +} + +inline bool sdkComparePPM(const char *src_file, const char *ref_file, + const float epsilon, const float threshold, + bool verboseErrors) { + unsigned char *src_data, *ref_data; + uint64_t error_count = 0; + unsigned int ref_width, ref_height; + unsigned int src_width, src_height; + + if (src_file == NULL || ref_file == NULL) { + if (verboseErrors) { + std::cerr << "PPMvsPPM: src_file or ref_file is NULL." + " Aborting comparison\n"; + } + + return false; + } + + if (verboseErrors) { + std::cerr << "> Compare (a)rendered: <" << src_file << ">\n"; + std::cerr << "> (b)reference: <" << ref_file << ">\n"; + } + + if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true) { + if (verboseErrors) { + std::cerr << "PPMvsPPM: unable to load ref image file: " << ref_file + << "\n"; + } + + return false; + } + + if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true) { + std::cerr << "PPMvsPPM: unable to load src image file: " << src_file + << "\n"; + return false; + } + + if (src_height != ref_height || src_width != ref_width) { + if (verboseErrors) { + std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width + << "," << src_height << ")vs(" << ref_width << "," << ref_height + << ")\n"; + } + } + + if (verboseErrors) { + std::cerr << "PPMvsPPM: comparing images size (" << src_width << "," + << src_height << ") epsilon(" << epsilon << "), threshold(" + << threshold * 100 << "%)\n"; + } + + if (compareData(ref_data, src_data, src_width * src_height * 4, epsilon, + threshold) == false) { + error_count = 1; + } + + if (error_count == 0) { + if (verboseErrors) { + std::cerr << " OK\n\n"; + } + } else { + if (verboseErrors) { + std::cerr << " FAILURE! " << error_count << " errors...\n\n"; + } + } + + // returns true if all pixels pass + return (error_count == 0) ? 
true : false; +} + +inline bool sdkComparePGM(const char *src_file, const char *ref_file, + const float epsilon, const float threshold, + bool verboseErrors) { + unsigned char *src_data = 0, *ref_data = 0; + uint64_t error_count = 0; + unsigned int ref_width, ref_height; + unsigned int src_width, src_height; + + if (src_file == NULL || ref_file == NULL) { + if (verboseErrors) { + std::cerr << "PGMvsPGM: src_file or ref_file is NULL." + " Aborting comparison\n"; + } + + return false; + } + + if (verboseErrors) { + std::cerr << "> Compare (a)rendered: <" << src_file << ">\n"; + std::cerr << "> (b)reference: <" << ref_file << ">\n"; + } + + if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true) { + if (verboseErrors) { + std::cerr << "PGMvsPGM: unable to load ref image file: " << ref_file + << "\n"; + } + + return false; + } + + if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true) { + std::cerr << "PGMvsPGM: unable to load src image file: " << src_file + << "\n"; + return false; + } + + if (src_height != ref_height || src_width != ref_width) { + if (verboseErrors) { + std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width + << "," << src_height << ")vs(" << ref_width << "," << ref_height + << ")\n"; + } + } + + if (verboseErrors) + std::cerr << "PGMvsPGM: comparing images size (" << src_width << "," + << src_height << ") epsilon(" << epsilon << "), threshold(" + << threshold * 100 << "%)\n"; + + if (compareData(ref_data, src_data, src_width * src_height, epsilon, + threshold) == false) { + error_count = 1; + } + + if (error_count == 0) { + if (verboseErrors) { + std::cerr << " OK\n\n"; + } + } else { + if (verboseErrors) { + std::cerr << " FAILURE! " << error_count << " errors...\n\n"; + } + } + + // returns true if all pixels pass + return (error_count == 0) ? 
true : false; +} + +#endif // COMMON_HELPER_IMAGE_H_ diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Common/helper_string.h b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Common/helper_string.h new file mode 100644 index 0000000000..39a1b38058 --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Common/helper_string.h @@ -0,0 +1,428 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// These are helper functions for the SDK samples (string parsing, timers, etc) +#ifndef COMMON_HELPER_STRING_H_ +#define COMMON_HELPER_STRING_H_ + +#include +#include +#include +#include + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#ifndef _CRT_SECURE_NO_DEPRECATE +#define _CRT_SECURE_NO_DEPRECATE +#endif +#ifndef STRCASECMP +#define STRCASECMP _stricmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP _strnicmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result != 0) +#endif +#ifndef SSCANF +#define SSCANF sscanf_s +#endif +#ifndef SPRINTF +#define SPRINTF sprintf_s +#endif +#else // Linux Includes +#include +#include + +#ifndef STRCASECMP +#define STRCASECMP strcasecmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP strncasecmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result == NULL) +#endif +#ifndef SSCANF +#define SSCANF sscanf +#endif +#ifndef SPRINTF +#define SPRINTF sprintf +#endif +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// CUDA Utility Helper 
Functions +inline int stringRemoveDelimiter(char delimiter, const char *string) { + int string_start = 0; + + while (string[string_start] == delimiter) { + string_start++; + } + + if (string_start >= static_cast(strlen(string) - 1)) { + return 0; + } + + return string_start; +} + +inline int getFileExtension(char *filename, char **extension) { + int string_length = static_cast(strlen(filename)); + + while (filename[string_length--] != '.') { + if (string_length == 0) break; + } + + if (string_length > 0) string_length += 2; + + if (string_length == 0) + *extension = NULL; + else + *extension = &filename[string_length]; + + return string_length; +} + +inline bool checkCmdLineFlag(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + + const char *equal_pos = strchr(string_argv, '='); + int argv_length = static_cast( + equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); + + int length = static_cast(strlen(string_ref)); + + if (length == argv_length && + !STRNCASECMP(string_argv, string_ref, length)) { + bFound = true; + continue; + } + } + } + + return bFound; +} + +// This function wraps the CUDA Driver API into a template function +template +inline bool getCmdLineArgumentValue(const int argc, const char **argv, + const char *string_ref, T *value) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 
1 : 0; + *value = (T)atoi(&string_argv[length + auto_inc]); + } + + bFound = true; + i = argc; + } + } + } + + return bFound; +} + +inline int getCmdLineArgumentInt(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + int value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + value = atoi(&string_argv[length + auto_inc]); + } else { + value = 0; + } + + bFound = true; + continue; + } + } + } + + if (bFound) { + return value; + } else { + return 0; + } +} + +inline float getCmdLineArgumentFloat(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + float value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 
1 : 0; + value = static_cast(atof(&string_argv[length + auto_inc])); + } else { + value = 0.f; + } + + bFound = true; + continue; + } + } + } + + if (bFound) { + return value; + } else { + return 0; + } +} + +inline bool getCmdLineArgumentString(const int argc, const char **argv, + const char *string_ref, + char **string_retval) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + char *string_argv = const_cast(&argv[i][string_start]); + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + *string_retval = &string_argv[length + 1]; + bFound = true; + continue; + } + } + } + + if (!bFound) { + *string_retval = NULL; + } + + return bFound; +} + +////////////////////////////////////////////////////////////////////////////// +//! Find the path for a file assuming that +//! files are found in the searchPath. +//! +//! @return the path if succeeded, otherwise 0 +//! @param filename name of the file +//! @param executable_path optional absolute path of the executable +////////////////////////////////////////////////////////////////////////////// +inline char *sdkFindFilePath(const char *filename, + const char *executable_path) { + // defines a variable that is replaced with the name of the + // executable + + // Typical relative search paths to locate needed companion files (e.g. 
sample + // input data, or JIT source files) The origin for the relative search may be + // the .exe file, a .bat file launching an .exe, a browser .exe launching the + // .exe or .bat, etc + const char *searchPath[] = { + "./", // same dir + "./data/", // same dir + + "../../../../Samples//", // up 4 in tree + "../../../Samples//", // up 3 in tree + "../../Samples//", // up 2 in tree + + "../../../../Samples//data/", // up 4 in tree + "../../../Samples//data/", // up 3 in tree + "../../Samples//data/", // up 2 in tree + + "../../../../Samples/0_Introduction//", // up 4 in tree + "../../../Samples/0_Introduction//", // up 3 in tree + "../../Samples/0_Introduction//", // up 2 in tree + + "../../../../Samples/1_Utilities//", // up 4 in tree + "../../../Samples/1_Utilities//", // up 3 in tree + "../../Samples/1_Utilities//", // up 2 in tree + + "../../../../Samples/2_Concepts_and_Techniques//", // up 4 in tree + "../../../Samples/2_Concepts_and_Techniques//", // up 3 in tree + "../../Samples/2_Concepts_and_Techniques//", // up 2 in tree + + "../../../../Samples/3_CUDA_Features//", // up 4 in tree + "../../../Samples/3_CUDA_Features//", // up 3 in tree + "../../Samples/3_CUDA_Features//", // up 2 in tree + + "../../../../Samples/4_CUDA_Libraries//", // up 4 in tree + "../../../Samples/4_CUDA_Libraries//", // up 3 in tree + "../../Samples/4_CUDA_Libraries//", // up 2 in tree + + "../../../../Samples/5_Domain_Specific//", // up 4 in tree + "../../../Samples/5_Domain_Specific//", // up 3 in tree + "../../Samples/5_Domain_Specific//", // up 2 in tree + + "../../../../Samples/6_Performance//", // up 4 in tree + "../../../Samples/6_Performance//", // up 3 in tree + "../../Samples/6_Performance//", // up 2 in tree + + "../../../../Samples/0_Introduction//data/", // up 4 in tree + "../../../Samples/0_Introduction//data/", // up 3 in tree + "../../Samples/0_Introduction//data/", // up 2 in tree + + "../../../../Samples/1_Utilities//data/", // up 4 in tree + 
"../../../Samples/1_Utilities//data/", // up 3 in tree + "../../Samples/1_Utilities//data/", // up 2 in tree + + "../../../../Samples/2_Concepts_and_Techniques//data/", // up 4 in tree + "../../../Samples/2_Concepts_and_Techniques//data/", // up 3 in tree + "../../Samples/2_Concepts_and_Techniques//data/", // up 2 in tree + + "../../../../Samples/3_CUDA_Features//data/", // up 4 in tree + "../../../Samples/3_CUDA_Features//data/", // up 3 in tree + "../../Samples/3_CUDA_Features//data/", // up 2 in tree + + "../../../../Samples/4_CUDA_Libraries//data/", // up 4 in tree + "../../../Samples/4_CUDA_Libraries//data/", // up 3 in tree + "../../Samples/4_CUDA_Libraries//data/", // up 2 in tree + + "../../../../Samples/5_Domain_Specific//data/", // up 4 in tree + "../../../Samples/5_Domain_Specific//data/", // up 3 in tree + "../../Samples/5_Domain_Specific//data/", // up 2 in tree + + "../../../../Samples/6_Performance//data/", // up 4 in tree + "../../../Samples/6_Performance//data/", // up 3 in tree + "../../Samples/6_Performance//data/", // up 2 in tree + + "../../../../Common/data/", // up 4 in tree + "../../../Common/data/", // up 3 in tree + "../../Common/data/" // up 2 in tree + }; + + // Extract the executable name + std::string executable_name; + + if (executable_path != 0) { + executable_name = std::string(executable_path); + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + // Windows path delimiter + size_t delimiter_pos = executable_name.find_last_of('\\'); + executable_name.erase(0, delimiter_pos + 1); + + if (executable_name.rfind(".exe") != std::string::npos) { + // we strip .exe, only if the .exe is found + executable_name.resize(executable_name.size() - 4); + } + +#else + // Linux & OSX path delimiter + size_t delimiter_pos = executable_name.find_last_of('/'); + executable_name.erase(0, delimiter_pos + 1); +#endif + } + + // Loop over all search paths and return the first hit + for (unsigned int i = 0; i < sizeof(searchPath) 
/ sizeof(char *); ++i) { + std::string path(searchPath[i]); + size_t executable_name_pos = path.find(""); + + // If there is executable_name variable in the searchPath + // replace it with the value + if (executable_name_pos != std::string::npos) { + if (executable_path != 0) { + path.replace(executable_name_pos, strlen(""), + executable_name); + } else { + // Skip this path entry if no executable argument is given + continue; + } + } + +#ifdef _DEBUG + printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str()); +#endif + + // Test if the file exists + path.append(filename); + FILE *fp; + FOPEN(fp, path.c_str(), "rb"); + + if (fp != NULL) { + fclose(fp); + // File found + // returning an allocated array here for backwards compatibility reasons + char *file_path = reinterpret_cast(malloc(path.length() + 1)); + STRCPY(file_path, path.length() + 1, path.c_str()); + return file_path; + } + + if (fp) { + fclose(fp); + } + } + + // File not found + printf("\nerror: sdkFindFilePath: file <%s> not found!\n", filename); + return 0; +} + +#endif // COMMON_HELPER_STRING_H_ diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Common/helper_timer.h b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Common/helper_timer.h new file mode 100644 index 0000000000..2fe3207ed9 --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Common/helper_timer.h @@ -0,0 +1,465 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// Helper Timing Functions +#ifndef COMMON_HELPER_TIMER_H_ +#define COMMON_HELPER_TIMER_H_ + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// includes, system +#include + +// includes, project +#include + +// Definition of the StopWatch Interface, this is used if we don't want to use +// the CUT functions But rather in a self contained class interface +class StopWatchInterface { + public: + StopWatchInterface() {} + virtual ~StopWatchInterface() {} + + public: + //! Start time measurement + virtual void start() = 0; + + //! Stop time measurement + virtual void stop() = 0; + + //! Reset time counters to zero + virtual void reset() = 0; + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! 
was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + virtual float getTime() = 0; + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + virtual float getAverageTime() = 0; +}; + +////////////////////////////////////////////////////////////////// +// Begin Stopwatch timer class definitions for all OS platforms // +////////////////////////////////////////////////////////////////// +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +// includes, system +#define WINDOWS_LEAN_AND_MEAN +#include +#undef min +#undef max + +//! Windows specific implementation of StopWatch +class StopWatchWin : public StopWatchInterface { + public: + //! Constructor, default + StopWatchWin() + : start_time(), + end_time(), + diff_time(0.0f), + total_time(0.0f), + running(false), + clock_sessions(0), + freq(0), + freq_set(false) { + if (!freq_set) { + // helper variable + LARGE_INTEGER temp; + + // get the tick frequency from the OS + QueryPerformanceFrequency(reinterpret_cast(&temp)); + + // convert to type in which it is needed + freq = (static_cast(temp.QuadPart)) / 1000.0; + + // rememeber query + freq_set = true; + } + } + + // Destructor + ~StopWatchWin() {} + + public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + + private: + // member variables + + //! 
Start of measurement + LARGE_INTEGER start_time; + //! End of measurement + LARGE_INTEGER end_time; + + //! Time difference between the last start and stop + float diff_time; + + //! TOTAL time difference between starts and stops + float total_time; + + //! flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; + + //! tick frequency + double freq; + + //! flag if the frequency has been set + bool freq_set; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchWin::start() { + QueryPerformanceCounter(reinterpret_cast(&start_time)); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop time measurement and increment add to the current diff_time summation +//! variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchWin::stop() { + QueryPerformanceCounter(reinterpret_cast(&end_time)); + diff_time = static_cast(((static_cast(end_time.QuadPart) - + static_cast(start_time.QuadPart)) / + freq)); + + total_time += diff_time; + clock_sessions++; + running = false; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. 
+//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchWin::reset() { + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) { + QueryPerformanceCounter(reinterpret_cast(&start_time)); + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. If the stop watch is still running (i.e. there +//! was no call to stop()) then the elapsed time is returned added to the +//! current diff_time sum, otherwise the current summed time difference alone +//! is returned. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchWin::getTime() { + // Return the TOTAL time to date + float retval = total_time; + + if (running) { + LARGE_INTEGER temp; + QueryPerformanceCounter(reinterpret_cast(&temp)); + retval += static_cast(((static_cast(temp.QuadPart) - + static_cast(start_time.QuadPart)) / + freq)); + } + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. for a single run based on the total number of COMPLETED runs +//! and the total time. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchWin::getAverageTime() { + return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f; +} +#else +// Declarations for Stopwatch on Linux and Mac OSX +// includes, system +#include +#include + +//! Windows specific implementation of StopWatch +class StopWatchLinux : public StopWatchInterface { + public: + //! Constructor, default + StopWatchLinux() + : start_time(), + diff_time(0.0), + total_time(0.0), + running(false), + clock_sessions(0) {} + + // Destructor + virtual ~StopWatchLinux() {} + + public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. 
after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + + private: + // helper functions + + //! Get difference between start time and current time + inline float getDiffTime(); + + private: + // member variables + + //! Start of measurement + struct timeval start_time; + + //! Time difference between the last start and stop + float diff_time; + + //! TOTAL time difference between starts and stops + float total_time; + + //! flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchLinux::start() { + gettimeofday(&start_time, 0); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop time measurement and increment add to the current diff_time summation +//! variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchLinux::stop() { + diff_time = getDiffTime(); + total_time += diff_time; + running = false; + clock_sessions++; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. 
+//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchLinux::reset() { + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) { + gettimeofday(&start_time, 0); + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. If the stop watch is still running (i.e. there +//! was no call to stop()) then the elapsed time is returned added to the +//! current diff_time sum, otherwise the current summed time difference alone +//! is returned. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchLinux::getTime() { + // Return the TOTAL time to date + float retval = total_time; + + if (running) { + retval += getDiffTime(); + } + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. for a single run based on the total number of COMPLETED runs +//! and the total time. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchLinux::getAverageTime() { + return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f; +} +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchLinux::getDiffTime() { + struct timeval t_time; + gettimeofday(&t_time, 0); + + // time difference in milli-seconds + return static_cast(1000.0 * (t_time.tv_sec - start_time.tv_sec) + + (0.001 * (t_time.tv_usec - start_time.tv_usec))); +} +#endif // WIN32 + +//////////////////////////////////////////////////////////////////////////////// +//! Timer functionality exported + +//////////////////////////////////////////////////////////////////////////////// +//! Create a new timer +//! @return true if a time has been created, otherwise false +//! 
@param name of the new timer, 0 if the creation failed +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkCreateTimer(StopWatchInterface **timer_interface) { +// printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface); +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + *timer_interface = reinterpret_cast(new StopWatchWin()); +#else + *timer_interface = + reinterpret_cast(new StopWatchLinux()); +#endif + return (*timer_interface != NULL) ? true : false; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Delete a timer +//! @return true if a time has been deleted, otherwise false +//! @param name of the timer to delete +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkDeleteTimer(StopWatchInterface **timer_interface) { + // printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + delete *timer_interface; + *timer_interface = NULL; + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Start the time with name \a name +//! @param name name of the timer to start +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkStartTimer(StopWatchInterface **timer_interface) { + // printf("sdkStartTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + (*timer_interface)->start(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop the time with name \a name. Does not reset. +//! 
@param name name of the timer to stop +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkStopTimer(StopWatchInterface **timer_interface) { + // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + (*timer_interface)->stop(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Resets the timer's counter. +//! @param name name of the timer to reset. +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkResetTimer(StopWatchInterface **timer_interface) { + // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + (*timer_interface)->reset(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Return the average time for timer execution as the total time +//! for the timer dividied by the number of completed (stopped) runs the timer +//! has made. +//! Excludes the current running time if the timer is currently running. +//! @param name name of the timer to return the time of +//////////////////////////////////////////////////////////////////////////////// +inline float sdkGetAverageTimerValue(StopWatchInterface **timer_interface) { + // printf("sdkGetAverageTimerValue called object %08x\n", (void + // *)*timer_interface); + if (*timer_interface) { + return (*timer_interface)->getAverageTime(); + } else { + return 0.0f; + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Total execution time for the timer over all runs since the last reset +//! or timer creation. +//! @param name name of the timer to obtain the value of. 
+//////////////////////////////////////////////////////////////////////////////// +inline float sdkGetTimerValue(StopWatchInterface **timer_interface) { + // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + return (*timer_interface)->getTime(); + } else { + return 0.0f; + } +} + +#endif // COMMON_HELPER_TIMER_H_ diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/MainSourceFiles.yaml b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/MainSourceFiles.yaml new file mode 100644 index 0000000000..ca1b0962aa --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/MainSourceFiles.yaml @@ -0,0 +1,1421 @@ +--- +MainSourceFile: MainSrcFiles_placehold +Replacements: + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 1563 + Length: 0 + ReplacementText: "#include \n#include \n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 1641 + Length: 34 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 1910 + Length: 43 + ReplacementText: 'dpct::constant_memory c_Kernel(KERNEL_LENGTH);' + ConstantFlag: DeviceConstant + ConstantOffset: 1910 + InitStr: '' + NewHostVarName: c_Kernel_host_ct1 + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2013 + 
Length: 18 + ReplacementText: 'dpct::get_default_queue().memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2032 + Length: 8 + ReplacementText: 'c_Kernel.get_ptr()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2082 + Length: 0 + ReplacementText: '.wait()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2383 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2520 + Length: 0 + ReplacementText: ",\n sycl::nd_item<3> item_ct1, float *c_Kernel,\n sycl::local_accessor s_Data" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2560 + Length: 16 + ReplacementText: auto + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2583 + Length: 23 + ReplacementText: 'item_ct1.get_group()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' 
+ BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2610 + Length: 138 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2811 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2887 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2920 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 2951 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 3159 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: 
'/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 3172 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 3342 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 3355 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 3643 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 3656 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 3811 + Length: 0 + ReplacementText: " /*\n DPCT1065:0: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + 
NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 3813 + Length: 13 + ReplacementText: 'item_ct1.barrier()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 4083 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 4096 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 4483 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 4495 + Length: 86 + ReplacementText: '1, imageH / ROWS_BLOCKDIM_Y, imageW / (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 4586 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: 
'/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 4599 + Length: 32 + ReplacementText: 1, ROWS_BLOCKDIM_Y, ROWS_BLOCKDIM_X + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 4635 + Length: 0 + ReplacementText: " /*\n DPCT1049:1: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 4637 + Length: 125 + ReplacementText: "dpct::get_default_queue().submit(\n [&](sycl::handler &cgh) {\n c_Kernel.init();\n\n auto c_Kernel_ptr_ct1 = c_Kernel.get_ptr();\n\n sycl::local_accessor s_Data_acc_ct1(sycl::range<2>(4/*4*/, 160/*(ROWS_RESULT_STEPS + 2 * ROWS_HALO_STEPS) *\n ROWS_BLOCKDIM_X*/), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(blocks * threads, threads), \n [=](sycl::nd_item<3> item_ct1) {\n convolutionRowsKernel(d_Dst, d_Src, imageW, imageH, imageW, item_ct1, c_Kernel_ptr_ct1, s_Data_acc_ct1);\n });\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 4762 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: 
'/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 5144 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 5287 + Length: 0 + ReplacementText: ",\n sycl::nd_item<3> item_ct1,\n float *c_Kernel,\n sycl::local_accessor s_Data" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 5327 + Length: 16 + ReplacementText: auto + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 5350 + Length: 23 + ReplacementText: 'item_ct1.get_group()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 5377 + Length: 261 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 5695 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 5729 
+ Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 5763 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 5886 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 6104 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 6117 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 6300 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 6313 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + 
NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 6659 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 6672 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 6867 + Length: 0 + ReplacementText: " /*\n DPCT1065:2: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 6869 + Length: 13 + ReplacementText: 'item_ct1.barrier()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 7154 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 7167 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + 
ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 7590 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 7602 + Length: 95 + ReplacementText: '1, imageH / (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y), imageW / COLUMNS_BLOCKDIM_X' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 7702 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 7715 + Length: 38 + ReplacementText: 1, COLUMNS_BLOCKDIM_Y, COLUMNS_BLOCKDIM_X + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 7757 + Length: 0 + ReplacementText: " /*\n DPCT1049:3: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. 
Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 7759 + Length: 131 + ReplacementText: "dpct::get_default_queue().submit(\n [&](sycl::handler &cgh) {\n c_Kernel.init();\n\n auto c_Kernel_ptr_ct1 = c_Kernel.get_ptr();\n\n sycl::local_accessor s_Data_acc_ct1(sycl::range<2>(16/*16*/, 81/*(COLUMNS_RESULT_STEPS +\n 2 * COLUMNS_HALO_STEPS) *\n COLUMNS_BLOCKDIM_Y +\n 1*/), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(blocks * threads, threads), \n [=](sycl::nd_item<3> item_ct1) {\n convolutionColumnsKernel(d_Dst, d_Src, imageW, imageH, imageW, item_ct1, c_Kernel_ptr_ct1, s_Data_acc_ct1);\n });\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Offset: 7890 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 1684 + Length: 26 + ReplacementText: "#include \n#include \n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 1840 + Length: 0 + ReplacementText: "\n#include \n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 3799 + Length: 0 + ReplacementText: " 
/*\n DPCT1003:24: Migrated API does not return error code. (*, 0) is inserted. You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 3824 + Length: 62 + ReplacementText: '(d_Input = sycl::malloc_device(imageW * imageH, dpct::get_default_queue()), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 3889 + Length: 0 + ReplacementText: " /*\n DPCT1003:25: Migrated API does not return error code. (*, 0) is inserted. You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 3914 + Length: 63 + ReplacementText: '(d_Output = sycl::malloc_device(imageW * imageH, dpct::get_default_queue()), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 3980 + Length: 0 + ReplacementText: " /*\n DPCT1003:26: Migrated API does not return error code. (*, 0) is inserted. 
You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 4005 + Length: 63 + ReplacementText: '(d_Buffer = sycl::malloc_device(imageW * imageH, dpct::get_default_queue()), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 4106 + Length: 0 + ReplacementText: " /*\n DPCT1003:27: Migrated API does not return error code. (*, 0) is inserted. You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 4124 + Length: 10 + ReplacementText: '(dpct::get_default_queue().memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 4184 + Length: 53 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 4238 + Length: 0 + ReplacementText: '.wait(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 4429 + Length: 0 + ReplacementText: " /*\n DPCT1003:28: Migrated API does not return error code. 
(*, 0) is inserted. You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 4451 + Length: 23 + ReplacementText: '(dpct::get_current_device().queues_wait_and_throw(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 4672 + Length: 0 + ReplacementText: " /*\n DPCT1003:29: Migrated API does not return error code. (*, 0) is inserted. You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 4690 + Length: 23 + ReplacementText: '(dpct::get_current_device().queues_wait_and_throw(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 5108 + Length: 0 + ReplacementText: " /*\n DPCT1003:30: Migrated API does not return error code. (*, 0) is inserted. 
You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 5126 + Length: 10 + ReplacementText: '(dpct::get_default_queue().memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 5220 + Length: 53 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 5274 + Length: 0 + ReplacementText: '.wait(), 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 5991 + Length: 0 + ReplacementText: " /*\n DPCT1003:31: Migrated API does not return error code. (*, 0) is inserted. 
You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 6009 + Length: 18 + ReplacementText: '(sycl::free(d_Buffer, dpct::get_default_queue())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 6027 + Length: 0 + ReplacementText: ', 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 6030 + Length: 0 + ReplacementText: " /*\n DPCT1003:32: Migrated API does not return error code. (*, 0) is inserted. You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 6048 + Length: 18 + ReplacementText: '(sycl::free(d_Output, dpct::get_default_queue())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 6066 + Length: 0 + ReplacementText: ', 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 6069 + Length: 0 + ReplacementText: " /*\n DPCT1003:33: Migrated API does not return error code. (*, 0) is inserted. 
You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 6087 + Length: 17 + ReplacementText: '(sycl::free(d_Input, dpct::get_default_queue())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Offset: 6104 + Length: 0 + ReplacementText: ', 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false +MainSourceFilesDigest: + - MainSourceFile: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + Digest: a415e6d147e03637b5e283e388bc2d95 + - MainSourceFile: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + Digest: 701666f627dc80bcbb7d7e3ae2f7bf55 +DpctVersion: 2023.0.0 +MainHelperFileName: dpct +USMLevel: '' +FeatureMap: + device.hpp: + dev_mgr: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dev_mgr + SubFeatureMap: + dev_mgr_1: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dev_mgr + SubFeatureMap: 
{} + dev_mgr_2: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dev_mgr + SubFeatureMap: {} + dev_mgr_3: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dev_mgr + SubFeatureMap: {} + dev_mgr_4: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dev_mgr + SubFeatureMap: {} + dev_mgr_check_id: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: 'dev_mgr::check_id' + SubFeatureMap: {} + dev_mgr_current_device: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: 'dev_mgr::current_device' + SubFeatureMap: {} + dev_mgr_current_device_id: + 
IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: 'dev_mgr::current_device_id' + SubFeatureMap: {} + dev_mgr_device_count: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'dev_mgr::device_count' + SubFeatureMap: {} + dev_mgr_get_device: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'dev_mgr::get_device' + SubFeatureMap: {} + dev_mgr_select_device: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'dev_mgr::select_device' + SubFeatureMap: {} + device_ext: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: device_ext + SubFeatureMap: + device_ext_1: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: device_ext + SubFeatureMap: {} + device_ext_2: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - 
'/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: device_ext + SubFeatureMap: {} + device_ext_default_queue: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: 'device_ext::default_queue' + SubFeatureMap: {} + device_ext_get_device_info_return_info: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_device_info' + SubFeatureMap: {} + device_ext_get_device_info_return_void: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_device_info' + SubFeatureMap: {} + device_ext_get_integrated: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_integrated' + SubFeatureMap: {} + device_ext_get_major_version: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_major_version' + SubFeatureMap: {} + device_ext_get_max_clock_frequency: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_max_clock_frequency' + SubFeatureMap: {} + device_ext_get_max_compute_units: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_max_compute_units' + SubFeatureMap: {} + device_ext_get_minor_version: + IsCalled: true + CallerSrcFiles: + - 
'/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_minor_version' + SubFeatureMap: {} + device_ext_get_version: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_ext::get_version' + SubFeatureMap: {} + device_ext_queues_wait_and_throw: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: 'device_ext::queues_wait_and_throw' + SubFeatureMap: {} + device_info: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: device_info + SubFeatureMap: + device_info_1: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: device_info + SubFeatureMap: {} + device_info_get_integrated: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::get_integrated' + SubFeatureMap: {} + device_info_get_max_clock_frequency: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::get_max_clock_frequency' + SubFeatureMap: {} + device_info_get_max_compute_units: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::get_max_compute_units' + SubFeatureMap: {} + device_info_set_global_mem_size: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_global_mem_size' + SubFeatureMap: {} + device_info_set_host_unified_memory: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_host_unified_memory' + SubFeatureMap: {} + device_info_set_local_mem_size: + IsCalled: 
true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_local_mem_size' + SubFeatureMap: {} + device_info_set_major_version: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_major_version' + SubFeatureMap: {} + device_info_set_max_clock_frequency: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_max_clock_frequency' + SubFeatureMap: {} + device_info_set_max_compute_units: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_max_compute_units' + SubFeatureMap: {} + device_info_set_max_nd_range_size: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_max_nd_range_size' + SubFeatureMap: {} + device_info_set_max_sub_group_size: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_max_sub_group_size' + SubFeatureMap: {} + device_info_set_max_work_group_size: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_max_work_group_size' + SubFeatureMap: {} + device_info_set_max_work_item_sizes: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_max_work_item_sizes' + SubFeatureMap: {} + device_info_set_max_work_items_per_compute_unit: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_max_work_items_per_compute_unit' + SubFeatureMap: {} + device_info_set_minor_version: + IsCalled: true + CallerSrcFiles: + - 
'/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_minor_version' + SubFeatureMap: {} + device_info_set_name: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: 'device_info::set_name' + SubFeatureMap: {} + exception_handler: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: exception_handler + SubFeatureMap: {} + get_current_device: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: get_current_device + SubFeatureMap: {} + get_default_queue: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: get_default_queue + SubFeatureMap: {} + get_tid: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: get_tid + SubFeatureMap: {} + select_device: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Common/helper_cuda.h' + FeatureName: select_device + SubFeatureMap: {} + dpct.hpp: + non_local_include_dependency: + IsCalled: true + CallerSrcFiles: + - '' + FeatureName: '' + SubFeatureMap: {} + 
memory.hpp: + constant_memory_alias: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: constant_memory + SubFeatureMap: {} + device_memory: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: device_memory + SubFeatureMap: + device_memory_1: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: device_memory + SubFeatureMap: {} + device_memory_2: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: device_memory + SubFeatureMap: {} + device_memory_3: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: device_memory + SubFeatureMap: {} + device_memory_4: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: device_memory + SubFeatureMap: {} + device_memory_5: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: device_memory + SubFeatureMap: {} + device_memory_6: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: device_memory + SubFeatureMap: {} + device_memory_allocate_device: + IsCalled: true + CallerSrcFiles: + - 
'/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: 'device_memory::allocate_device' + SubFeatureMap: {} + device_memory_get_ptr: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: 'device_memory::get_ptr' + SubFeatureMap: {} + device_memory_get_ptr_q: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: 'device_memory::get_ptr' + SubFeatureMap: {} + device_memory_init: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: 'device_memory::init' + SubFeatureMap: {} + device_memory_init_q: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: 'device_memory::init' + SubFeatureMap: {} + device_memory_value_t_alias: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: 'device_memory::value_t' + SubFeatureMap: {} + dpct_free: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: dpct_free + SubFeatureMap: {} + dpct_get_copy_range_detail: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_get_copy_range_detail + SubFeatureMap: {} + dpct_get_offset_detail: + IsCalled: true + CallerSrcFiles: + - 
'/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_get_offset_detail + SubFeatureMap: {} + dpct_malloc_detail: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: dpct_malloc + SubFeatureMap: {} + dpct_memcpy: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_memcpy + SubFeatureMap: {} + dpct_memcpy_2d: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_memcpy + SubFeatureMap: {} + dpct_memcpy_2d_3d_pitch_detail: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_memcpy + SubFeatureMap: {} + dpct_memcpy_2d_pitch_detail: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_memcpy + SubFeatureMap: {} + dpct_memcpy_3d: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_memcpy + SubFeatureMap: {} + dpct_memcpy_3d_detail: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: dpct_memcpy + SubFeatureMap: {} + dpct_memcpy_detail: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + 
FeatureName: dpct_memcpy + SubFeatureMap: {} + get_memcpy_direction: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: get_memcpy_direction + SubFeatureMap: {} + get_pointer_attribute: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: get_pointer_attribute + SubFeatureMap: {} + mem_mgr: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: mem_mgr + SubFeatureMap: + mem_mgr_1: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: mem_mgr + SubFeatureMap: {} + mem_mgr_2: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: mem_mgr + SubFeatureMap: {} + memcpy_direction: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: memcpy_direction + SubFeatureMap: {} + memory_region: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: memory_region + SubFeatureMap: {} + memory_traits: + IsCalled: 
true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: memory_traits + SubFeatureMap: {} + pitched_data: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: pitched_data + SubFeatureMap: + pitched_data_get_data_ptr: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: 'pitched_data::get_data_ptr' + SubFeatureMap: {} + pitched_data_get_pitch: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: 'pitched_data::get_pitch' + SubFeatureMap: {} + pitched_data_get_y: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: 'pitched_data::get_y' + SubFeatureMap: {} + pitched_data_1: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: pitched_data + SubFeatureMap: {} + pointer_access_attribute: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp' + FeatureName: pointer_access_attribute + SubFeatureMap: {} + typedef_buffer_t: + IsCalled: true + CallerSrcFiles: + - '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: buffer_t + SubFeatureMap: {} + typedef_byte_t: + IsCalled: true + CallerSrcFiles: + - 
'/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu' + FeatureName: byte_t + SubFeatureMap: {} +CompileTargets: {} +OptionMap: + AnalysisScopePath: + Value: '/home/tcs/Manjula_workspace/cuda-samples' + Specified: false + AsyncHandler: + Value: 'false' + Specified: false + CommentsEnabled: + Value: 'false' + Specified: false + CompilationsDir: + Value: '/home/tcs/Manjula_workspace/cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable' + Specified: true + CtadEnabled: + Value: 'false' + Specified: false + CustomHelperFileName: + Value: dpct + Specified: false + ExperimentalFlag: + Value: '0' + Specified: false + ExplicitClNamespace: + Value: 'false' + Specified: false + ExplicitNamespace: + Value: '20' + Specified: false + ExtensionDDFlag: + Value: '0' + Specified: false + ExtensionDEFlag: + Value: '4294967295' + Specified: false + NDRangeDim: + Value: '3' + Specified: false + NoDRYPattern: + Value: 'false' + Specified: false + OptimizeMigration: + Value: 'false' + Specified: false + ProcessAll: + Value: 'false' + Specified: false + RuleFile: + Value: '' + Specified: false + SyclNamedLambda: + Value: 'false' + Specified: false + UsmLevel: + Value: '1' + Specified: false +... diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.dp.cpp b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.dp.cpp new file mode 100644 index 0000000000..e6c1672a52 --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.dp.cpp @@ -0,0 +1,291 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include + +#include "convolutionSeparable_common.h" + +//////////////////////////////////////////////////////////////////////////////// +// Convolution kernel storage +//////////////////////////////////////////////////////////////////////////////// +dpct::constant_memory c_Kernel(KERNEL_LENGTH); + +extern "C" void setConvolutionKernel(float *h_Kernel) { + dpct::get_default_queue() + .memcpy(c_Kernel.get_ptr(), h_Kernel, KERNEL_LENGTH * sizeof(float)) + .wait(); +} + +//////////////////////////////////////////////////////////////////////////////// +// Row convolution filter +//////////////////////////////////////////////////////////////////////////////// +#define ROWS_BLOCKDIM_X 16 +#define ROWS_BLOCKDIM_Y 4 +#define ROWS_RESULT_STEPS 8 +#define ROWS_HALO_STEPS 1 + +void convolutionRowsKernel(float *d_Dst, float *d_Src, int imageW, + int imageH, int pitch, + sycl::nd_item<3> item_ct1, float *c_Kernel, + sycl::local_accessor s_Data) { + // Handle to thread block group + auto cta = item_ct1.get_group(); + + // Offset to the left halo edge + const int baseX = + (item_ct1.get_group(2) * ROWS_RESULT_STEPS - ROWS_HALO_STEPS) * + ROWS_BLOCKDIM_X + + item_ct1.get_local_id(2); + const int baseY = + item_ct1.get_group(1) * ROWS_BLOCKDIM_Y + item_ct1.get_local_id(1); + + d_Src += baseY * pitch + baseX; + d_Dst += baseY * pitch + baseX; + +// Load main data +#pragma unroll + + for (int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++) { + s_Data[item_ct1.get_local_id(1)] + [item_ct1.get_local_id(2) + i * ROWS_BLOCKDIM_X] = + d_Src[i * ROWS_BLOCKDIM_X]; + } + +// Load left halo +#pragma unroll + + for (int i = 0; i < ROWS_HALO_STEPS; i++) { + s_Data[item_ct1.get_local_id(1)] + [item_ct1.get_local_id(2) + i * ROWS_BLOCKDIM_X] = + (baseX >= -i * ROWS_BLOCKDIM_X) ? 
d_Src[i * ROWS_BLOCKDIM_X] : 0; + } + +// Load right halo +#pragma unroll + + for (int i = ROWS_HALO_STEPS + ROWS_RESULT_STEPS; + i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS + ROWS_HALO_STEPS; i++) { + s_Data[item_ct1.get_local_id(1)][item_ct1.get_local_id(2) + + i * ROWS_BLOCKDIM_X] = + (imageW - baseX > i * ROWS_BLOCKDIM_X) ? d_Src[i * ROWS_BLOCKDIM_X] : 0; + } + + // Compute and store results + /* + DPCT1065:0: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + item_ct1.barrier(); + +float a[2*KERNEL_RADIUS + 1]; +#pragma unroll + for(int i=0; i<= 2*KERNEL_RADIUS; i++) + { + a[i]=c_Kernel[i]; + } +#pragma unroll + + for (int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++) { + float sum =0; + +//#pragma unroll + + for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) { + sum += a[KERNEL_RADIUS - j] * + s_Data[item_ct1.get_local_id(1)] + [item_ct1.get_local_id(2) + i * ROWS_BLOCKDIM_X + j]; + } + d_Dst[i * ROWS_BLOCKDIM_X] = sum; + } +} + +extern "C" void convolutionRowsGPU(float *d_Dst, float *d_Src, int imageW, + int imageH) { + assert(ROWS_BLOCKDIM_X * ROWS_HALO_STEPS >= KERNEL_RADIUS); + assert(imageW % (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X) == 0); + assert(imageH % ROWS_BLOCKDIM_Y == 0); + + sycl::range<3> blocks(1, imageH / ROWS_BLOCKDIM_Y, + imageW / (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X)); + sycl::range<3> threads(1, ROWS_BLOCKDIM_Y, ROWS_BLOCKDIM_X); + + /* + DPCT1049:1: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. 
+ */ + dpct::get_default_queue().submit([&](sycl::handler &cgh) { + c_Kernel.init(); + + auto c_Kernel_ptr_ct1 = c_Kernel.get_ptr(); + + sycl::local_accessor s_Data_acc_ct1(sycl::range<2>( + 4 /*4*/, + 160 /*(ROWS_RESULT_STEPS + + 2 * ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X*/), + cgh); + + cgh.parallel_for(sycl::nd_range<3>(blocks * threads, threads), + [=](sycl::nd_item<3> item_ct1) { + convolutionRowsKernel(d_Dst, d_Src, imageW, imageH, + imageW, item_ct1, c_Kernel_ptr_ct1, + s_Data_acc_ct1); + }); + }); + getLastCudaError("convolutionRowsKernel() execution failed\n"); +} + +//////////////////////////////////////////////////////////////////////////////// +// Column convolution filter +//////////////////////////////////////////////////////////////////////////////// +#define COLUMNS_BLOCKDIM_X 16 +#define COLUMNS_BLOCKDIM_Y 8 +#define COLUMNS_RESULT_STEPS 8 +#define COLUMNS_HALO_STEPS 1 + +void convolutionColumnsKernel(float *d_Dst, float *d_Src, int imageW, + int imageH, int pitch, + sycl::nd_item<3> item_ct1, + float *c_Kernel, + sycl::local_accessor s_Data) { + // Handle to thread block group + auto cta = item_ct1.get_group(); + + // Offset to the upper halo edge + const int baseX = + item_ct1.get_group(2) * COLUMNS_BLOCKDIM_X + item_ct1.get_local_id(2); + const int baseY = + (item_ct1.get_group(1) * COLUMNS_RESULT_STEPS - COLUMNS_HALO_STEPS) * + COLUMNS_BLOCKDIM_Y + + item_ct1.get_local_id(1); + d_Src += baseY * pitch + baseX; + d_Dst += baseY * pitch + baseX; + +// Main data +#pragma unroll + + for (int i = COLUMNS_HALO_STEPS; + i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++) { + s_Data[item_ct1.get_local_id(2)] + [item_ct1.get_local_id(1) + i * COLUMNS_BLOCKDIM_Y] = + d_Src[i * COLUMNS_BLOCKDIM_Y * pitch]; + } + +// Upper halo +#pragma unroll + + for (int i = 0; i < COLUMNS_HALO_STEPS; i++) { + s_Data[item_ct1.get_local_id(2)] + [item_ct1.get_local_id(1) + i * COLUMNS_BLOCKDIM_Y] = + (baseY >= -i * COLUMNS_BLOCKDIM_Y) + ? 
d_Src[i * COLUMNS_BLOCKDIM_Y * pitch] + : 0; + } + +// Lower halo +#pragma unroll + + for (int i = COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; + i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS + COLUMNS_HALO_STEPS; + i++) { + s_Data[item_ct1.get_local_id(2)] + [item_ct1.get_local_id(1) + i * COLUMNS_BLOCKDIM_Y] = + (imageH - baseY > i * COLUMNS_BLOCKDIM_Y) + ? d_Src[i * COLUMNS_BLOCKDIM_Y * pitch] + : 0; + } + + // Compute and store results + /* + DPCT1065:2: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + item_ct1.barrier(); + + +float a[2*KERNEL_RADIUS + 1]; +#pragma unroll + for(int i=0; i<= 2*KERNEL_RADIUS; i++) + { + a[i]=c_Kernel[i]; + } +//#pragma unroll + + for (int i = COLUMNS_HALO_STEPS; + i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++) { + float sum =0; +#pragma unroll + + for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) { + sum += a[KERNEL_RADIUS - j] * + s_Data[item_ct1.get_local_id(2)] + [item_ct1.get_local_id(1) + i * COLUMNS_BLOCKDIM_Y + j]; + } + d_Dst[i * COLUMNS_BLOCKDIM_Y * pitch] = sum; + } +} + +extern "C" void convolutionColumnsGPU(float *d_Dst, float *d_Src, int imageW, + int imageH) { + assert(COLUMNS_BLOCKDIM_Y * COLUMNS_HALO_STEPS >= KERNEL_RADIUS); + assert(imageW % COLUMNS_BLOCKDIM_X == 0); + assert(imageH % (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y) == 0); + + sycl::range<3> blocks(1, imageH / (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y), + imageW / COLUMNS_BLOCKDIM_X); + sycl::range<3> threads(1, COLUMNS_BLOCKDIM_Y, COLUMNS_BLOCKDIM_X); + + /* + DPCT1049:3: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. 
+ */ + dpct::get_default_queue().submit([&](sycl::handler &cgh) { + c_Kernel.init(); + + auto c_Kernel_ptr_ct1 = c_Kernel.get_ptr(); + + sycl::local_accessor s_Data_acc_ct1( + sycl::range<2>(16 /*16*/, 81 /*(COLUMNS_RESULT_STEPS + + 2 * COLUMNS_HALO_STEPS) * + COLUMNS_BLOCKDIM_Y + + 1*/), cgh); + + cgh.parallel_for(sycl::nd_range<3>(blocks * threads, threads), + [=](sycl::nd_item<3> item_ct1) { + convolutionColumnsKernel( + d_Dst, d_Src, imageW, imageH, imageW, item_ct1, + c_Kernel_ptr_ct1, s_Data_acc_ct1); + }); + }); + getLastCudaError("convolutionColumnsKernel() execution failed\n"); +} diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_common.h b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_common.h new file mode 100644 index 0000000000..9dba4a5a42 --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_common.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CONVOLUTIONSEPARABLE_COMMON_H +#define CONVOLUTIONSEPARABLE_COMMON_H + +#define KERNEL_RADIUS 8 +#define KERNEL_LENGTH (2 * KERNEL_RADIUS + 1) + +//////////////////////////////////////////////////////////////////////////////// +// Reference CPU convolution +//////////////////////////////////////////////////////////////////////////////// +extern "C" void convolutionRowCPU(float *h_Dst, float *h_Src, float *h_Kernel, + int imageW, int imageH, int kernelR); + +extern "C" void convolutionColumnCPU(float *h_Dst, float *h_Src, + float *h_Kernel, int imageW, int imageH, + int kernelR); + +//////////////////////////////////////////////////////////////////////////////// +// GPU convolution +//////////////////////////////////////////////////////////////////////////////// +extern "C" void setConvolutionKernel(float *h_Kernel); + +extern "C" void convolutionRowsGPU(float *d_Dst, float *d_Src, int imageW, + int imageH); + +extern "C" void convolutionColumnsGPU(float *d_Dst, float *d_Src, int 
imageW, + int imageH); + +#endif diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_gold.cpp b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_gold.cpp new file mode 100644 index 0000000000..e8a40ca816 --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_gold.cpp @@ -0,0 +1,69 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "convolutionSeparable_common.h" + +//////////////////////////////////////////////////////////////////////////////// +// Reference row convolution filter +//////////////////////////////////////////////////////////////////////////////// +extern "C" void convolutionRowCPU(float *h_Dst, float *h_Src, float *h_Kernel, + int imageW, int imageH, int kernelR) { + for (int y = 0; y < imageH; y++) + for (int x = 0; x < imageW; x++) { + float sum = 0; + + for (int k = -kernelR; k <= kernelR; k++) { + int d = x + k; + + if (d >= 0 && d < imageW) + sum += h_Src[y * imageW + d] * h_Kernel[kernelR - k]; + } + + h_Dst[y * imageW + x] = sum; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Reference column convolution filter +//////////////////////////////////////////////////////////////////////////////// +extern "C" void convolutionColumnCPU(float *h_Dst, float *h_Src, + float *h_Kernel, int imageW, int imageH, + int kernelR) { + for (int y = 0; y < imageH; y++) + for (int x = 0; x < imageW; x++) { + float sum = 0; + + for (int k = -kernelR; k <= kernelR; k++) { + int d = y + k; + + if (d >= 0 && d < imageH) + sum += h_Src[d * imageW + x] * h_Kernel[kernelR - k]; + } + + h_Dst[y * imageW + x] = sum; + } +} diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp.dp.cpp 
b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp.dp.cpp new file mode 100644 index 0000000000..7bb43b3f8a --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp.dp.cpp @@ -0,0 +1,226 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* +* This sample implements a separable convolution filter +* of a 2D image with an arbitrary kernel. +*/ + +// CUDA runtime +#include +#include + +// Utilities and system includes +#include +#include + +#include "convolutionSeparable_common.h" +#include + +//////////////////////////////////////////////////////////////////////////////// +// Reference CPU convolution +//////////////////////////////////////////////////////////////////////////////// +extern "C" void convolutionRowCPU(float *h_Result, float *h_Data, + float *h_Kernel, int imageW, int imageH, + int kernelR); + +extern "C" void convolutionColumnCPU(float *h_Result, float *h_Data, + float *h_Kernel, int imageW, int imageH, + int kernelR); + +//////////////////////////////////////////////////////////////////////////////// +// Main program +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) { + // start logs + printf("[%s] - Starting...\n", argv[0]); + + float *h_Kernel, *h_Input, *h_Buffer, *h_OutputCPU, *h_OutputGPU; + + float *d_Input, *d_Output, *d_Buffer; + + const int imageW = 3072; + const int imageH = 3072; + const int iterations = 16; + + StopWatchInterface *hTimer = NULL; + + // Use command-line specified CUDA device, otherwise use device with highest + // Gflops/s +// findCudaDevice(argc, (const char **)argv); + std::cout << "\nRunning on " + << dpct::get_default_queue().get_device().get_info() << "\n"; + + sdkCreateTimer(&hTimer); + + printf("Image Width x Height = %i x %i\n\n", imageW, imageH); + printf("Allocating and initializing host arrays...\n"); + h_Kernel = (float *)malloc(KERNEL_LENGTH * sizeof(float)); + h_Input = (float *)malloc(imageW * imageH * sizeof(float)); + h_Buffer = (float *)malloc(imageW * imageH * sizeof(float)); + h_OutputCPU = (float *)malloc(imageW * imageH * sizeof(float)); + h_OutputGPU = (float *)malloc(imageW * imageH * sizeof(float)); + srand(200); + + for (unsigned int i = 0; i < KERNEL_LENGTH; 
i++) { + h_Kernel[i] = (float)(rand() % 16); + } + + for (unsigned i = 0; i < imageW * imageH; i++) { + h_Input[i] = (float)(rand() % 16); + } + + printf("Allocating and initializing CUDA arrays...\n"); + /* + DPCT1003:24: Migrated API does not return error code. (*, 0) is inserted. You + may need to rewrite this code. + */ + checkCudaErrors((d_Input = sycl::malloc_device( + imageW * imageH, dpct::get_default_queue()), + 0)); + /* + DPCT1003:25: Migrated API does not return error code. (*, 0) is inserted. You + may need to rewrite this code. + */ + checkCudaErrors((d_Output = sycl::malloc_device( + imageW * imageH, dpct::get_default_queue()), + 0)); + /* + DPCT1003:26: Migrated API does not return error code. (*, 0) is inserted. You + may need to rewrite this code. + */ + checkCudaErrors((d_Buffer = sycl::malloc_device( + imageW * imageH, dpct::get_default_queue()), + 0)); + + setConvolutionKernel(h_Kernel); + /* + DPCT1003:27: Migrated API does not return error code. (*, 0) is inserted. You + may need to rewrite this code. + */ + checkCudaErrors( + (dpct::get_default_queue() + .memcpy(d_Input, h_Input, imageW * imageH * sizeof(float)) + .wait(), + 0)); + + printf("Running GPU convolution (%u identical iterations)...\n\n", + iterations); + + for (int i = -1; i < iterations; i++) { + // i == -1 -- warmup iteration + if (i == 0) { + /* + DPCT1003:28: Migrated API does not return error code. (*, 0) is inserted. + You may need to rewrite this code. + */ + checkCudaErrors((dpct::get_current_device().queues_wait_and_throw(), 0)); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); + } + + convolutionRowsGPU(d_Buffer, d_Input, imageW, imageH); + + convolutionColumnsGPU(d_Output, d_Buffer, imageW, imageH); + } + + /* + DPCT1003:29: Migrated API does not return error code. (*, 0) is inserted. You + may need to rewrite this code. 
+ */ + checkCudaErrors((dpct::get_current_device().queues_wait_and_throw(), 0)); + sdkStopTimer(&hTimer); + double gpuTime = 0.001 * sdkGetTimerValue(&hTimer) / (double)iterations; + printf( + "convolutionSeparable, Throughput = %.4f MPixels/sec, Time = %.5f s, " + "Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n", + (1.0e-6 * (double)(imageW * imageH) / gpuTime), gpuTime, + (imageW * imageH), 1, 0); + + printf("\nReading back GPU results...\n\n"); + /* + DPCT1003:30: Migrated API does not return error code. (*, 0) is inserted. You + may need to rewrite this code. + */ + checkCudaErrors( + (dpct::get_default_queue() + .memcpy(h_OutputGPU, d_Output, imageW * imageH * sizeof(float)) + .wait(), + 0)); + + printf("Checking the results...\n"); + printf(" ...running convolutionRowCPU()\n"); + convolutionRowCPU(h_Buffer, h_Input, h_Kernel, imageW, imageH, KERNEL_RADIUS); + + printf(" ...running convolutionColumnCPU()\n"); + convolutionColumnCPU(h_OutputCPU, h_Buffer, h_Kernel, imageW, imageH, + KERNEL_RADIUS); + + printf(" ...comparing the results\n"); + double sum = 0, delta = 0; + + for (unsigned i = 0; i < imageW * imageH; i++) { + delta += + (h_OutputGPU[i] - h_OutputCPU[i]) * (h_OutputGPU[i] - h_OutputCPU[i]); + sum += h_OutputCPU[i] * h_OutputCPU[i]; + } + + double L2norm = sqrt(delta / sum); + printf(" ...Relative L2 norm: %E\n\n", L2norm); + printf("Shutting down...\n"); + + /* + DPCT1003:31: Migrated API does not return error code. (*, 0) is inserted. You + may need to rewrite this code. + */ + checkCudaErrors((sycl::free(d_Buffer, dpct::get_default_queue()), 0)); + /* + DPCT1003:32: Migrated API does not return error code. (*, 0) is inserted. You + may need to rewrite this code. + */ + checkCudaErrors((sycl::free(d_Output, dpct::get_default_queue()), 0)); + /* + DPCT1003:33: Migrated API does not return error code. (*, 0) is inserted. You + may need to rewrite this code. 
+ */ + checkCudaErrors((sycl::free(d_Input, dpct::get_default_queue()), 0)); + free(h_OutputGPU); + free(h_OutputCPU); + free(h_Buffer); + free(h_Input); + free(h_Kernel); + + sdkDeleteTimer(&hTimer); + + if (L2norm > 1e-6) { + printf("Test failed!\n"); + exit(EXIT_FAILURE); + } + + printf("Test passed\n"); + exit(EXIT_SUCCESS); +} diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/include/dpct/device.hpp b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/include/dpct/device.hpp new file mode 100644 index 0000000000..c34711c142 --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/include/dpct/device.hpp @@ -0,0 +1,388 @@ +//==---- device.hpp -------------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. 
+// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_DEVICE_HPP__ +#define __DPCT_DEVICE_HPP__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(__linux__) +#include +#include +#endif +#if defined(_WIN64) +#define NOMINMAX +#include +#endif + + +namespace dpct { + +/// SYCL default exception handler +auto exception_handler = [](sycl::exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } catch (sycl::exception const &e) { + std::cerr << "Caught asynchronous SYCL exception:" << std::endl + << e.what() << std::endl + << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + } + } +}; + +class device_info { +public: + + int get_integrated() const { return _integrated; } + + int get_max_clock_frequency() const { return _frequency; } + + int get_max_compute_units() const { return _max_compute_units; } + + void set_name(const char* name) { + size_t length = strlen(name); + if (length < 256) { + std::memcpy(_name, name, length + 1); + } else { + std::memcpy(_name, name, 255); + _name[255] = '\0'; + } + } + + void set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes) { + _max_work_item_sizes = max_work_item_sizes; + for (int i = 0; i < 3; ++i) + _max_work_item_sizes_i[i] = max_work_item_sizes[i]; + } + + void set_host_unified_memory(bool host_unified_memory) { + _host_unified_memory = host_unified_memory; + } + + void set_major_version(int major) { _major = major; } + + void set_minor_version(int minor) { _minor = minor; } + + void set_max_clock_frequency(int frequency) { _frequency = frequency; } + + void set_max_compute_units(int max_compute_units) { + _max_compute_units = max_compute_units; + } + + void set_global_mem_size(size_t global_mem_size) { + _global_mem_size = global_mem_size; + } + + void set_local_mem_size(size_t local_mem_size) { + 
_local_mem_size = local_mem_size; + } + + void set_max_work_group_size(int max_work_group_size) { + _max_work_group_size = max_work_group_size; + } + + void set_max_sub_group_size(int max_sub_group_size) { + _max_sub_group_size = max_sub_group_size; + } + + void + set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit) { + _max_work_items_per_compute_unit = max_work_items_per_compute_unit; + } + + void set_max_nd_range_size(int max_nd_range_size[]) { + for (int i = 0; i < 3; i++) { + _max_nd_range_size[i] = max_nd_range_size[i]; + _max_nd_range_size_i[i] = max_nd_range_size[i]; + } + } + +private: + char _name[256]; + sycl::id<3> _max_work_item_sizes; + int _max_work_item_sizes_i[3]; + bool _host_unified_memory = false; + int _major; + int _minor; + int _integrated = 0; + int _frequency; + int _max_compute_units; + int _max_work_group_size; + int _max_sub_group_size; + int _max_work_items_per_compute_unit; + size_t _global_mem_size; + size_t _local_mem_size; + size_t _max_nd_range_size[3]; + int _max_nd_range_size_i[3]; +}; + +/// dpct device extension +class device_ext : public sycl::device { +public: + device_ext() : sycl::device(), _ctx(*this) {} + ~device_ext() { + std::lock_guard lock(m_mutex); + for (auto &task : _tasks) { + if (task.joinable()) + task.join(); + } + _tasks.clear(); + _queues.clear(); + } + device_ext(const sycl::device &base) + : sycl::device(base), _ctx(*this) { + _queues.push_back(std::make_shared( + _ctx, base, exception_handler, sycl::property::queue::in_order())); + _saved_queue = _default_queue = _queues[0].get(); + } + + int get_major_version() const { + int major, minor; + get_version(major, minor); + return major; + } + + int get_minor_version() const { + int major, minor; + get_version(major, minor); + return minor; + } + + int get_max_compute_units() const { + return get_device_info().get_max_compute_units(); + } + + int get_max_clock_frequency() const { + return get_device_info().get_max_clock_frequency(); + } + + 
int get_integrated() const { return get_device_info().get_integrated(); } + + void get_device_info(device_info &out) const { + device_info prop; + prop.set_name(get_info().c_str()); + + int major, minor; + get_version(major, minor); + prop.set_major_version(major); + prop.set_minor_version(minor); + + prop.set_max_work_item_sizes( +#if (__SYCL_COMPILER_VERSION && __SYCL_COMPILER_VERSION<20220902) + // oneAPI DPC++ compiler older than 2022/09/02, where max_work_item_sizes is an enum class element + get_info()); +#else + // SYCL 2020-conformant code, max_work_item_sizes is a struct templated by an int + get_info>()); +#endif + prop.set_host_unified_memory( + this->has(sycl::aspect::usm_host_allocations)); + + prop.set_max_clock_frequency( + get_info()); + + prop.set_max_compute_units( + get_info()); + prop.set_max_work_group_size( + get_info()); + prop.set_global_mem_size( + get_info()); + prop.set_local_mem_size(get_info()); + + size_t max_sub_group_size = 1; + std::vector sub_group_sizes = + get_info(); + + for (const auto &sub_group_size : sub_group_sizes) { + if (max_sub_group_size < sub_group_size) + max_sub_group_size = sub_group_size; + } + + prop.set_max_sub_group_size(max_sub_group_size); + + prop.set_max_work_items_per_compute_unit( + get_info()); + int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; + prop.set_max_nd_range_size(max_nd_range_size); + + out = prop; + } + + device_info get_device_info() const { + device_info prop; + get_device_info(prop); + return prop; + } + + sycl::queue &default_queue() { return *_default_queue; } + + void queues_wait_and_throw() { + std::unique_lock lock(m_mutex); + std::vector> current_queues( + _queues); + lock.unlock(); + for (const auto &q : current_queues) { + q->wait_and_throw(); + } + // Guard the destruct of current_queues to make sure the ref count is safe. + lock.lock(); + } + +private: + + void get_version(int &major, int &minor) const { + // Version string has the following format: + // a. 
OpenCL + // b. + std::string ver; + ver = get_info(); + std::string::size_type i = 0; + while (i < ver.size()) { + if (isdigit(ver[i])) + break; + i++; + } + major = std::stoi(&(ver[i])); + while (i < ver.size()) { + if (ver[i] == '.') + break; + i++; + } + i++; + minor = std::stoi(&(ver[i])); + } + + sycl::queue *_default_queue; + sycl::queue *_saved_queue; + sycl::context _ctx; + std::vector> _queues; + mutable std::mutex m_mutex; + std::vector _tasks; +}; + +static inline unsigned int get_tid() { +#if defined(__linux__) + return syscall(SYS_gettid); +#elif defined(_WIN64) + return GetCurrentThreadId(); +#else +#error "Only support Windows and Linux." +#endif +} + +/// device manager +class dev_mgr { + +public: + + device_ext ¤t_device() { + unsigned int dev_id=current_device_id(); + check_id(dev_id); + return *_devs[dev_id]; + } + + device_ext &get_device(unsigned int id) const { + std::lock_guard lock(m_mutex); + check_id(id); + return *_devs[id]; + } + + unsigned int current_device_id() const { + std::lock_guard lock(m_mutex); + auto it=_thread2dev_map.find(get_tid()); + if(it != _thread2dev_map.end()) + return it->second; + return DEFAULT_DEVICE_ID; + } + + void select_device(unsigned int id) { + std::lock_guard lock(m_mutex); + check_id(id); + _thread2dev_map[get_tid()]=id; + } + + unsigned int device_count() { return _devs.size(); } + + /// Returns the instance of device manager singleton. + static dev_mgr &instance() { + static dev_mgr d_m; + return d_m; + } + dev_mgr(const dev_mgr &) = delete; + dev_mgr &operator=(const dev_mgr &) = delete; + dev_mgr(dev_mgr &&) = delete; + dev_mgr &operator=(dev_mgr &&) = delete; + +private: + mutable std::mutex m_mutex; + dev_mgr() { + sycl::device default_device = + sycl::device(sycl::default_selector_v); + _devs.push_back(std::make_shared(default_device)); + + std::vector sycl_all_devs = + sycl::device::get_devices(sycl::info::device_type::all); + // Collect other devices except for the default device. 
+ if (default_device.is_cpu()) + _cpu_device = 0; + for (auto &dev : sycl_all_devs) { + if (dev == default_device) { + continue; + } + _devs.push_back(std::make_shared(dev)); + if (_cpu_device == -1 && dev.is_cpu()) { + _cpu_device = _devs.size() - 1; + } + } + } + + void check_id(unsigned int id) const { + if (id >= _devs.size()) { + throw std::runtime_error("invalid device id"); + } + } + + std::vector> _devs; + /// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current + /// thread id in _thread2dev_map, which means default device should be used + /// for the current thread. + const unsigned int DEFAULT_DEVICE_ID = 0; + /// thread-id to device-id map. + std::map _thread2dev_map; + int _cpu_device = -1; +}; + +/// Util function to get the default queue of current device in +/// dpct device manager. +static inline sycl::queue &get_default_queue() { + return dev_mgr::instance().current_device().default_queue(); +} + +/// Util function to get the current device. +static inline device_ext &get_current_device() { + return dev_mgr::instance().current_device(); +} + +static inline unsigned int select_device(unsigned int id){ + dev_mgr::instance().select_device(id); + return id; +} + +} // namespace dpct + +#endif // __DPCT_DEVICE_HPP__ diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/include/dpct/dpct.hpp b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/include/dpct/dpct.hpp new file mode 100644 index 0000000000..3099386e04 --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/include/dpct/dpct.hpp @@ -0,0 +1,19 @@ +//==---- dpct.hpp ---------------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. 
+// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_HPP__ +#define __DPCT_HPP__ + +#include +#include +#include + +#include "device.hpp" +#include "memory.hpp" + +#endif // __DPCT_HPP__ diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/include/dpct/memory.hpp b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/include/dpct/memory.hpp new file mode 100644 index 0000000000..75bf8ec102 --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/02_sycl_migrated_optimized/include/dpct/memory.hpp @@ -0,0 +1,600 @@ +//==---- memory.hpp -------------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_MEMORY_HPP__ +#define __DPCT_MEMORY_HPP__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__linux__) +#include +#elif defined(_WIN64) +#define NOMINMAX +#include +#else +#error "Only support Windows and Linux." +#endif + +#include "device.hpp" + +namespace dpct { + +enum memcpy_direction { + host_to_host, + host_to_device, + device_to_host, + device_to_device, + automatic +}; + +enum memory_region { + global = 0, // device global memory + constant, // device constant memory + local, // device local memory + shared, // memory which can be accessed by host and device +}; + +typedef uint8_t byte_t; + +/// Buffer type to be used in Memory Management runtime. +typedef sycl::buffer buffer_t; + +/// Pitched 2D/3D memory data. 
+class pitched_data { +public: + pitched_data() : pitched_data(nullptr, 0, 0, 0) {} + pitched_data(void *data, size_t pitch, size_t x, size_t y) + : _data(data), _pitch(pitch), _x(x), _y(y) {} + + void *get_data_ptr() { return _data; } + + size_t get_pitch() { return _pitch; } + + size_t get_y() { return _y; } + +private: + void *_data; + size_t _pitch, _x, _y; +}; + +namespace detail { + +class mem_mgr { + mem_mgr() { + // Reserved address space, no real memory allocation happens here. +#if defined(__linux__) + mapped_address_space = + (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); +#elif defined(_WIN64) + mapped_address_space = (byte_t *)VirtualAlloc( + NULL, // NULL specified as the base address parameter + mapped_region_size, // Size of allocation + MEM_RESERVE, // Allocate reserved pages + PAGE_NOACCESS); // Protection = no access +#else +#error "Only support Windows and Linux." +#endif + next_free = mapped_address_space; + }; + +public: + using buffer_id_t = int; + + struct allocation { + buffer_t buffer; + byte_t *alloc_ptr; + size_t size; + }; + + ~mem_mgr() { +#if defined(__linux__) + munmap(mapped_address_space, mapped_region_size); +#elif defined(_WIN64) + VirtualFree(mapped_address_space, 0, MEM_RELEASE); +#else +#error "Only support Windows and Linux." +#endif + }; + + mem_mgr(const mem_mgr &) = delete; + mem_mgr &operator=(const mem_mgr &) = delete; + mem_mgr(mem_mgr &&) = delete; + mem_mgr &operator=(mem_mgr &&) = delete; + + /// Returns the instance of memory manager singleton. + static mem_mgr &instance() { + static mem_mgr m; + return m; + } + +private: + std::map m_map; + mutable std::mutex m_mutex; + byte_t *mapped_address_space; + byte_t *next_free; + const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024; + const size_t alignment = 256; + /// This padding may be defined to some positive value to debug + /// out of bound accesses. 
+ const size_t extra_padding = 0; + +}; + +template class accessor; +template class memory_traits { +public: + static constexpr sycl::access::target target = + (Memory == constant) ? sycl::access::target::constant_buffer + : sycl::access::target::device; + static constexpr sycl::access_mode mode = + (Memory == constant) ? sycl::access_mode::read + : sycl::access_mode::read_write; + static constexpr size_t type_size = sizeof(T); + using element_t = + typename std::conditional::type; + using value_t = typename std::remove_cv::type; + template + using accessor_t = typename std::conditional< + Memory == local, sycl::local_accessor, + sycl::accessor>::type; + using pointer_t = T *; +}; + +static inline void *dpct_malloc(size_t size, sycl::queue &q) { + return sycl::malloc_device(size, q.get_device(), q.get_context()); +} + +enum class pointer_access_attribute { + host_only = 0, + device_only, + host_device, + end +}; + +static pointer_access_attribute get_pointer_attribute(sycl::queue &q, + const void *ptr) { + switch (sycl::get_pointer_type(ptr, q.get_context())) { + case sycl::usm::alloc::unknown: + return pointer_access_attribute::host_only; + case sycl::usm::alloc::device: + return pointer_access_attribute::device_only; + case sycl::usm::alloc::shared: + case sycl::usm::alloc::host: + return pointer_access_attribute::host_device; + } +} + +static memcpy_direction deduce_memcpy_direction(sycl::queue &q, void *to_ptr, + const void *from_ptr, + memcpy_direction dir) { + switch (dir) { + case memcpy_direction::host_to_host: + case memcpy_direction::host_to_device: + case memcpy_direction::device_to_host: + case memcpy_direction::device_to_device: + return dir; + case memcpy_direction::automatic: { + // table[to_attribute][from_attribute] + static const memcpy_direction + direction_table[static_cast(pointer_access_attribute::end)] + [static_cast(pointer_access_attribute::end)] = + {{memcpy_direction::host_to_host, + memcpy_direction::device_to_host, + 
memcpy_direction::host_to_host}, + {memcpy_direction::host_to_device, + memcpy_direction::device_to_device, + memcpy_direction::device_to_device}, + {memcpy_direction::host_to_host, + memcpy_direction::device_to_device, + memcpy_direction::device_to_device}}; + return direction_table[static_cast(get_pointer_attribute( + q, to_ptr))][static_cast(get_pointer_attribute(q, from_ptr))]; + } + default: + throw std::runtime_error("dpct_memcpy: invalid direction value"); + } +} + +static sycl::event +dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size, + memcpy_direction direction, + const std::vector &dep_events = {}) { + if (!size) + return sycl::event{}; + return q.memcpy(to_ptr, from_ptr, size, dep_events); +} + +// Get actual copy range and make sure it will not exceed range. +static inline size_t get_copy_range(sycl::range<3> size, size_t slice, + size_t pitch) { + return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0); +} + +static inline size_t get_offset(sycl::id<3> id, size_t slice, + size_t pitch) { + return slice * id.get(2) + pitch * id.get(1) + id.get(0); +} + +/// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr +/// and \p from_range to another specified by \p to_ptr and \p to_range. 
+static inline std::vector +dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, + sycl::range<3> to_range, sycl::range<3> from_range, + sycl::id<3> to_id, sycl::id<3> from_id, + sycl::range<3> size, memcpy_direction direction, + const std::vector &dep_events = {}) { + // RAII for host pointer + class host_buffer { + void *_buf; + size_t _size; + sycl::queue &_q; + const std::vector &_deps; // free operation depends + + public: + host_buffer(size_t size, sycl::queue &q, + const std::vector &deps) + : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {} + void *get_ptr() const { return _buf; } + size_t get_size() const { return _size; } + ~host_buffer() { + if (_buf) { + _q.submit([&](sycl::handler &cgh) { + cgh.depends_on(_deps); + cgh.host_task([buf = _buf] { std::free(buf); }); + }); + } + } + }; + std::vector event_list; + + size_t to_slice = to_range.get(1) * to_range.get(0), + from_slice = from_range.get(1) * from_range.get(0); + unsigned char *to_surface = + (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0)); + const unsigned char *from_surface = + (const unsigned char *)from_ptr + + get_offset(from_id, from_slice, from_range.get(0)); + + if (to_slice == from_slice && to_slice == size.get(1) * size.get(0)) { + return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2), + direction, dep_events)}; + } + direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction); + size_t size_slice = size.get(1) * size.get(0); + switch (direction) { + case host_to_host: + for (size_t z = 0; z < size.get(2); ++z) { + unsigned char *to_ptr = to_surface; + const unsigned char *from_ptr = from_surface; + if (to_range.get(0) == from_range.get(0) && + to_range.get(0) == size.get(0)) { + event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice, + direction, dep_events)); + } else { + for (size_t y = 0; y < size.get(1); ++y) { + event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0), + direction, 
dep_events)); + to_ptr += to_range.get(0); + from_ptr += from_range.get(0); + } + } + to_surface += to_slice; + from_surface += from_slice; + } + break; + case host_to_device: { + host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q, + event_list); + std::vector host_events; + if (to_slice == size_slice) { + // Copy host data to a temp host buffer with the shape of target. + host_events = + dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range, + sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, + host_to_host, dep_events); + } else { + // Copy host data to a temp host buffer with the shape of target. + host_events = dpct_memcpy( + q, buf.get_ptr(), from_surface, to_range, from_range, + sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host, + // If has padding data, not sure whether it is useless. So fill temp + // buffer with it. + std::vector{ + dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(), + device_to_host, dep_events)}); + } + // Copy from temp host buffer to device with only one submit. + event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(), + buf.get_size(), host_to_device, + host_events)); + break; + } + case device_to_host: { + host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q, + event_list); + // Copy from host temp buffer to host target with reshaping. + event_list = dpct_memcpy( + q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0), + sycl::id<3>(0, 0, 0), size, host_to_host, + // Copy from device to temp host buffer with only one submit. 
+ std::vector{dpct_memcpy(q, buf.get_ptr(), from_surface, + buf.get_size(), + device_to_host, dep_events)}); + break; + } + case device_to_device: + event_list.push_back(q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dep_events); + cgh.parallel_for( + size, + [=](sycl::id<3> id) { + to_surface[get_offset(id, to_slice, to_range.get(0))] = + from_surface[get_offset(id, from_slice, from_range.get(0))]; + }); + })); + break; + default: + throw std::runtime_error("dpct_memcpy: invalid direction value"); + } + return event_list; +} + +/// memcpy 2D/3D matrix specified by pitched_data. +static inline std::vector +dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id, + pitched_data from, sycl::id<3> from_id, sycl::range<3> size, + memcpy_direction direction = automatic) { + return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(), + sycl::range<3>(to.get_pitch(), to.get_y(), 1), + sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id, + size, direction); +} + +/// memcpy 2D matrix with pitch. +static inline std::vector +dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, + size_t to_pitch, size_t from_pitch, size_t x, size_t y, + memcpy_direction direction = automatic) { + return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1), + sycl::range<3>(from_pitch, y, 1), + sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), + sycl::range<3>(x, y, 1), direction); +} + +} // namespace detail + +/// free +/// \param ptr Point to free. +/// \param q Queue to execute the free task. +/// \returns no return value. +static inline void dpct_free(void *ptr, + sycl::queue &q = get_default_queue()) { + if (ptr) { + sycl::free(ptr, q.get_context()); + } +} + +/// Synchronously copies \p size bytes from the address specified by \p from_ptr +/// to the address specified by \p to_ptr. 
The value of \p direction is used to +/// set the copy direction, it can be \a host_to_host, \a host_to_device, +/// \a device_to_host, \a device_to_device or \a automatic. The function will +/// return after the copy is completed. +/// +/// \param to_ptr Pointer to destination memory address. +/// \param from_ptr Pointer to source memory address. +/// \param size Number of bytes to be copied. +/// \param direction Direction of the copy. +/// \param q Queue to execute the copy task. +/// \returns no return value. +static void dpct_memcpy(void *to_ptr, const void *from_ptr, size_t size, + memcpy_direction direction = automatic, + sycl::queue &q = get_default_queue()) { + detail::dpct_memcpy(q, to_ptr, from_ptr, size, direction).wait(); +} + +/// Synchronously copies 2D matrix specified by \p x and \p y from the address +/// specified by \p from_ptr to the address specified by \p to_ptr, while \p +/// from_pitch and \p to_pitch are the range of dim x in bytes of the matrix +/// specified by \p from_ptr and \p to_ptr. The value of \p direction is used to +/// set the copy direction, it can be \a host_to_host, \a host_to_device, \a +/// device_to_host, \a device_to_device or \a automatic. The function will +/// return after the copy is completed. +/// +/// \param to_ptr Pointer to destination memory address. +/// \param to_pitch Range of dim x in bytes of destination matrix. +/// \param from_ptr Pointer to source memory address. +/// \param from_pitch Range of dim x in bytes of source matrix. +/// \param x Range of dim x of matrix to be copied. +/// \param y Range of dim y of matrix to be copied. +/// \param direction Direction of the copy. +/// \param q Queue to execute the copy task. +/// \returns no return value. 
+static inline void dpct_memcpy(void *to_ptr, size_t to_pitch, + const void *from_ptr, size_t from_pitch, + size_t x, size_t y, + memcpy_direction direction = automatic, + sycl::queue &q = dpct::get_default_queue()) { + sycl::event::wait(detail::dpct_memcpy(q, to_ptr, from_ptr, to_pitch, + from_pitch, x, y, direction)); +} + +/// Synchronously copies a subset of a 3D matrix specified by \p from to another +/// 3D matrix specified by \p to. The from and to position info are specified +/// by \p from_pos and \p to_pos. The copied matrix size is specified by \p size. +/// The value of \p direction is used to set the copy direction, it can be \a +/// host_to_host, \a host_to_device, \a device_to_host, \a device_to_device or +/// \a automatic. The function will return after the copy is completed. +/// +/// \param to Destination matrix info. +/// \param to_pos Position of destination. +/// \param from Source matrix info. +/// \param from_pos Position of source. +/// \param size Range of the submatrix to be copied. +/// \param direction Direction of the copy. +/// \param q Queue to execute the copy task. +/// \returns no return value. +static inline void dpct_memcpy(pitched_data to, sycl::id<3> to_pos, + pitched_data from, sycl::id<3> from_pos, + sycl::range<3> size, + memcpy_direction direction = automatic, + sycl::queue &q = dpct::get_default_queue()) { + sycl::event::wait( + detail::dpct_memcpy(q, to, to_pos, from, from_pos, size, direction)); +} + +namespace detail { + +/// Device variable with address space of shared, global or constant.
+template +class device_memory { +public: + + using value_t = typename detail::memory_traits::value_t; + + device_memory() : device_memory(sycl::range(1)) {} + + /// Constructor of 1-D array with initializer list + device_memory( + const sycl::range &in_range, + std::initializer_list &&init_list) + : device_memory(in_range) { + assert(init_list.size() <= in_range.size()); + _host_ptr = (value_t *)std::malloc(_size); + std::memset(_host_ptr, 0, _size); + std::memcpy(_host_ptr, init_list.begin(), init_list.size() * sizeof(T)); + } + + /// Constructor of 2-D array with initializer list + template + device_memory( + const typename std::enable_if>::type &in_range, + std::initializer_list> &&init_list) + : device_memory(in_range) { + assert(init_list.size() <= in_range[0]); + _host_ptr = (value_t *)std::malloc(_size); + std::memset(_host_ptr, 0, _size); + auto tmp_data = _host_ptr; + for (auto sub_list : init_list) { + assert(sub_list.size() <= in_range[1]); + std::memcpy(tmp_data, sub_list.begin(), sub_list.size() * sizeof(T)); + tmp_data += in_range[1]; + } + } + + /// Constructor with range + device_memory(const sycl::range &range_in) + : _size(range_in.size() * sizeof(T)), _range(range_in), _reference(false), + _host_ptr(nullptr), _device_ptr(nullptr) { + static_assert( + (Memory == global) || (Memory == constant) || (Memory == shared), + "device memory region should be global, constant or shared"); + // Make sure that singleton class mem_mgr and dev_mgr will destruct later + // than this. + detail::mem_mgr::instance(); + dev_mgr::instance(); + } + + /// Constructor with range + template + device_memory(Args... Arguments) + : device_memory(sycl::range(Arguments...)) {} + + ~device_memory() { + if (_device_ptr && !_reference) + dpct_free(_device_ptr); + if (_host_ptr) + std::free(_host_ptr); + } + + /// Allocate memory with default queue, and init memory if has initial value. 
+ void init() { + init(dpct::get_default_queue()); + } + + /// Allocate memory with specified queue, and init memory if has initial value. + void init(sycl::queue &q) { + if (_device_ptr) + return; + if (!_size) + return; + allocate_device(q); + if (_host_ptr) + detail::dpct_memcpy(q, _device_ptr, _host_ptr, _size, host_to_device); + } + + /// Get memory pointer of the memory object, which is virtual pointer when + /// usm is not used, and device pointer when usm is used. + value_t *get_ptr() { + return get_ptr(get_default_queue()); + } + + /// Get memory pointer of the memory object, which is virtual pointer when + /// usm is not used, and device pointer when usm is used. + value_t *get_ptr(sycl::queue &q) { + init(q); + return _device_ptr; + } + + template + typename std::enable_if::type &operator[](size_t index) { + init(); + return _device_ptr[index]; + } + +private: + device_memory(value_t *memory_ptr, size_t size) + : _size(size), _range(size / sizeof(T)), _reference(true), + _host_ptr(nullptr), _device_ptr(memory_ptr) {} // _host_ptr must be null here: the destructor frees it whenever it is non-null + + void allocate_device(sycl::queue &q) { + if (Memory == shared) { + _device_ptr = (value_t *)sycl::malloc_shared( + _size, q.get_device(), q.get_context()); + return; + } + _device_ptr = (value_t *)detail::dpct_malloc(_size, q); + } + + size_t _size; + sycl::range _range; + bool _reference; + value_t *_host_ptr; + value_t *_device_ptr; +}; +template +class device_memory : public device_memory { +public: + using base = device_memory; + using value_t = typename base::value_t; + + /// Constructor with initial value.
+ device_memory(const value_t &val) : base(sycl::range<1>(1), {val}) {} + + /// Default constructor + device_memory() : base(1) {} + +}; + +} // namespace detail + +template +using constant_memory = detail::device_memory; + +} // namespace dpct + +#endif // __DPCT_MEMORY_HPP__ diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/CMakeLists.txt b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/CMakeLists.txt new file mode 100644 index 0000000000..ad426a73c1 --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/CMakeLists.txt @@ -0,0 +1,24 @@ +cmake_minimum_required (VERSION 3.4.0) + +set(CMAKE_CXX_COMPILER "icpx") + +project (guided_convolutionSeparable_SYCLmigration) +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif () + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) + +option (BUILD_SYCL_MIGRATED_SUBPROJECT "Build targets from subproject dpct_output" ON) +option (BUILD_SYCL_MIGRATED_OPTIMIZED_SUBPROJECT "Build targets from subproject convolutionSeparable_migrated_optimized" ON) + +if (BUILD_SYCL_MIGRATED_SUBPROJECT) + add_subdirectory (01_dpct_output) +endif () +if (BUILD_SYCL_MIGRATED_OPTIMIZED_SUBPROJECT) + add_subdirectory (02_sycl_migrated_optimized) +endif () diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/License.txt b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/License.txt new file mode 100644 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person 
obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/README.md b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/README.md new file mode 100644 index 0000000000..dbacf19e01 --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/README.md @@ -0,0 +1,211 @@ +# `convolutionSeparable` Sample + +Separable convolution is a process in which a single convolution can be divided into two or more convolutions to produce the same output. This sample is implemented using SYCL* by migrating code from the original CUDA source code and offloading computations to a GPU/CPU. + +| Property | Description +|:--- |:--- +| What you will learn | Migrate convolutionSeparable from CUDA to SYCL and optimize it +| Time to complete | 15 minutes + +## Purpose + +The sample shows the migration of convolutionSeparable from CUDA to SYCL using the SYCLomatic tool and optimizing the migrated SYCL code further to achieve good results.
+ + +>**Note**: We use the Intel® open-source SYCLomatic migration tool, which assists developers in porting CUDA code automatically to SYCL code. To finish the process, developers complete the rest of the coding manually and then tune to the desired level of performance for the target architecture. Users can also use the Intel® DPC++ Compatibility Tool, which comes with the Intel® oneAPI Base Toolkit. + +This sample contains two versions in the following folders: + +| Folder Name | Description +|:--- |:--- +| `01_dpct_output` | Contains the output of the SYCLomatic tool, which is a fully migrated version of the CUDA code. +| `02_sycl_migrated_optimized` | Contains the optimized SYCL code + +## Workflow For CUDA to SYCL migration + +Refer to [Workflow](https://www.intel.com/content/www/us/en/developer/tools/oneapi/training/cuda-sycl-migration-workflow.html#gs.s2njvh) for details. + +## CUDA source code evaluation + +A Separable Convolution is a process in which a single convolution can be divided into two or more convolutions to produce the same output. This sample implements a separable convolution filter of a 2D image with an arbitrary kernel. The code has two functions, convolutionRowsGPU and convolutionColumnsGPU, which call the kernel functions (convolutionRowsKernel and convolutionColumnsKernel) where the loading of the input data and the computations are performed. We validate the results against a reference CPU separable convolution implementation by calculating the relative L2 norm. + +This sample is migrated from an NVIDIA CUDA sample. See the sample [convolutionSeparable](https://github.com/NVIDIA/cuda-samples/tree/master/Samples/2_Concepts_and_Techniques/convolutionSeparable) in the NVIDIA/cuda-samples GitHub repository.
+ +## Prerequisites + +| Optimized for | Description +|:--- |:--- +| OS | Ubuntu* 20.04 +| Hardware | Intel® Gen9, Gen11 and Xeon CPU +| Software | SYCLomatic version 2023.0, Intel oneAPI Base Toolkit version 2023.0 + +For more information on how to install the SYCLomatic tool, visit [Migrate from CUDA* to C++ with SYCL*](https://www.intel.com/content/www/us/en/developer/tools/oneapi/training/migrate-from-cuda-to-cpp-with-sycl.html#gs.v354cy). + +## Key Implementation Details + +This sample demonstrates the migration of the following CUDA features: + +- Shared memory +- Constant memory +- Cooperative groups + +## Build the `convolutionSeparable` Sample for CPU and GPU + +> **Note**: If you have not already done so, set up your CLI +> environment by sourcing the `setvars` script in the root of your oneAPI installation. +> +> Linux*: +> - For system-wide installations: `. /opt/intel/oneapi/setvars.sh` +> - For private installations: ` . ~/intel/oneapi/setvars.sh` +> - For non-POSIX shells, like csh, use the following command: `bash -c 'source <install-dir>/setvars.sh ; exec csh'` +> +> For more information on configuring environment variables, see [Use the setvars Script with Linux* or macOS*](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/oneapi-development-environment-setup/use-the-setvars-script-with-linux-or-macos.html). + +### Tool assisted migration – SYCLomatic + +For this sample, the SYCLomatic tool automatically migrates 100% of the CUDA runtime APIs to SYCL. Follow these steps to generate the SYCL code using the compatibility tool: + +1. git clone https://github.com/NVIDIA/cuda-samples.git +2. cd cuda-samples/Samples/2_Concepts_and_Techniques/convolutionSeparable/ +3. Generate a compilation database with intercept-build + ``` + intercept-build make + ``` +4. The above step creates a JSON file named compile_commands.json with all the compiler invocations and stores the names of the input files and the compiler options. +5. 
Pass the JSON file as input to the SYCLomatic Tool. The result is written to a folder named dpct_output. The `--in-root` option specifies the path to the root of the source tree to be migrated. + ``` + c2s -p compile_commands.json --in-root ../../.. --use-custom-helper=api + ``` + +### Optimizations + +The migrated code can be optimized by using profiling tools, which help in identifying the hotspots (in this case convolutionRowsKernel() and convolutionColumnsKernel()). + +If we observe the migrated SYCL code, especially in the above-mentioned functions, we see many ‘for’ loops that are being unrolled. +Although loop unrolling exposes opportunities for instruction scheduling optimization by the compiler and thus can improve performance, sometimes it may increase pressure on register allocation and cause register spilling. + +So, it is always a good idea to compare the performance with and without loop unrolling, and with different unroll factors, to decide if a loop should be unrolled or how many times to unroll it. + +In this case, by implementing the above technique, we can decrease the execution time by avoiding loop unrolling at the innermost “for-loop” of the computation part in the convolutionRowsKernel function (line 120) and avoiding loop unrolling at the outer loop of the computation part in the convolutionColumnsKernel function (line 242) of the file convolutionSeparable.dp.cpp. + +We can further decrease the execution time by avoiding the repeated loading of the c_Kernel[] array (as it is independent of the `i` for-loop in the convolutionSeparable.dp.cpp file).
+ + ``` + for (int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++) { + float sum = 0; + for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) { + sum += c_Kernel[KERNEL_RADIUS - j] *s_Data[item_ct1.get_local_id(1)][item_ct1.get_local_id(2) + i * ROWS_BLOCKDIM_X + j];} + ``` + +We can separate the array and load it into another new array and use it in place of c_Kernel + + ``` + float a[2*KERNEL_RADIUS + 1]; + for(int i=0; i<= 2*KERNEL_RADIUS; i++) + a[i]=c_Kernel[i]; + ``` +>**Note**: These optimization techniques also work with the larger input image sizes. + +### On Linux* + +1. Change to the sample directory. +2. Build the program. + ``` + $ mkdir build + $ cd build + $ cmake .. + $ make + ``` + + By default, this command sequence will build the `dpct_output` as well as `sycl_migrated_optimized` versions of the program. + +3. Run the code + + You can run the programs for CPU and GPU. The commands indicate the device target. + + Run `dpct_output` on GPU. + ``` + make run + ``` + Run `dpct_output` on CPU. + ``` + export SYCL_DEVICE_FILTER=cpu + make run + unset SYCL_DEVICE_FILTER + ``` + Run `sycl_migrated_optimized` on GPU. + ``` + make run_smo + ``` + Run `sycl_migrated_optimized` on CPU. + ``` + export SYCL_DEVICE_FILTER=cpu + make run_smo + unset SYCL_DEVICE_FILTER + ``` +#### Troubleshooting + +If an error occurs, you can get more details by running `make` with +the `VERBOSE=1` argument: +``` +make VERBOSE=1 +``` +If you receive an error message, troubleshoot the problem using the **Diagnostics Utility for Intel® oneAPI Toolkits**. The diagnostic utility provides configuration and system checks to help find missing dependencies, permissions errors, and other issues. See the [Diagnostics Utility for Intel® oneAPI Toolkits User Guide](https://www.intel.com/content/www/us/en/develop/documentation/diagnostic-utility-user-guide/top.html) for more information on using the utility. 
+ + +## Example output + +dpct_output + +``` +Image Width x Height = 3072 x 3072 + +Allocating and initializing host arrays... +Allocating and initializing CUDA arrays... +Running GPU convolution (16 identical iterations)... + +convolutionSeparable, Throughput = 3222.8755 MPixels/sec, Time = 0.00293 s, Size = 9437184 Pixels, NumDevsUsed = 1, Workgroup = 0 + +Reading back GPU results... + +Checking the results... + ...running convolutionRowCPU() + ...running convolutionColumnCPU() + ...comparing the results + ...Relative L2 norm: 0.000000E+00 + +Shutting down... +Test passed +Built target run_gpu +``` + +sycl_migrated_optimized + +``` +Image Width x Height = 3072 x 3072 + +Allocating and initializing host arrays... +Allocating and initializing CUDA arrays... +Running GPU convolution (16 identical iterations)... + +convolutionSeparable, Throughput = 21469.4930 MPixels/sec, Time = 0.00044 s, Size = 9437184 Pixels, NumDevsUsed = 1, Workgroup = 0 + +Reading back GPU results... + +Checking the results... + ...running convolutionRowCPU() + ...running convolutionColumnCPU() + ...comparing the results + ...Relative L2 norm: 0.000000E+00 + +Shutting down... +Test passed +Built target run_cmo_gpu +``` + +## License +Code samples are licensed under the MIT license. See +[License.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/License.txt) for details. + +Third party program licenses are at [third-party-programs.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/third-party-programs.txt). 
+ diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/sample.json b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/sample.json new file mode 100644 index 0000000000..74d693e83d --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/sample.json @@ -0,0 +1,27 @@ +{ + "guid": "842EBEB9-B902-4706-8C08-02E4EB550FC1", + "name": "convolutionSeparable", + "categories": ["Toolkit/oneAPI Direct Programming/C++SYCL/"], + "description": "This sample implements a separable convolution filter of a 2D signal with a gaussian kernel.", + "toolchain": [ "dpcpp" ], + "languages": [ { "cpp": {} } ], + "targetDevice": [ "CPU", "GPU" ], + "os": [ "linux" ], + "builder": [ "cmake" ], + "ciTests": { + "linux": [{ + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "make run_cpu", + "make run_gpu", + "make run_cmo_cpu", + "make run_cmo_gpu" + ] + }] + + }, + "expertise": "Code Optimization" +} diff --git a/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/third-party-programs.txt b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/third-party-programs.txt new file mode 100644 index 0000000000..12e4bd0c36 --- /dev/null +++ b/DirectProgramming/C++SYCL/guided_convolutionSeparable_SYCLmigration/third-party-programs.txt @@ -0,0 +1,518 @@ +oneAPI Code Samples - Third Party Programs File + +This file contains the list of third party software ("third party programs") +contained in the Intel software and their required notices and/or license +terms. This third party software, even if included with the distribution of the +Intel software, may be governed by separate license terms, including without +limitation, third party license terms, other Intel software license terms, and +open source software license terms.
These separate license terms govern your use +of the third party programs as set forth in the “third-party-programs.txt” or +other similarly named text file. + +Third party programs and their corresponding required notices and/or license +terms are listed below. + +-------------------------------------------------------------------------------- +1. n-digit-mnist + +Apache License 2.0 + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +-------------------------------------------------------------------------------- +2. GNU-EFI + Copyright (c) 1998-2000 Intel Corporation + +The files in the "lib" and "inc" subdirectories are using the EFI Application +Toolkit distributed by Intel at http://developer.intel.com/technology/efi + +This code is covered by the following agreement: + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. THE EFI SPECIFICATION AND ALL OTHER INFORMATION +ON THIS WEB SITE ARE PROVIDED "AS IS" WITH NO WARRANTIES, AND ARE SUBJECT +TO CHANGE WITHOUT NOTICE. + +-------------------------------------------------------------------------------- +3. Edk2 + Copyright (c) 2019, Intel Corporation. All rights reserved. + + Edk2 Basetools + Copyright (c) 2019, Intel Corporation. All rights reserved. + +SPDX-License-Identifier: BSD-2-Clause-Patent + +-------------------------------------------------------------------------------- +4. Cuda-Samples + Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of NVIDIA CORPORATION nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- +5. Rodinia + Copyright (c)2008-2011 University of Virginia +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted without royalty fees or other restrictions, provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + * Neither the name of the University of Virginia, the Dept. of Computer Science, nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF VIRGINIA OR THE SOFTWARE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +If you use this software or a modified version of it, please cite the most relevant among the following papers: + + - M. A. Goodrum, M. J. Trotter, A. Aksel, S. T. Acton, and K. Skadron. Parallelization of Particle Filter Algorithms. In Proceedings of the 3rd Workshop on Emerging Applications and Many-core Architecture (EAMA), in conjunction with the IEEE/ACM International +Symposium on Computer Architecture (ISCA), June 2010. + + - S. Che, M. Boyer, J. Meng, D. Tarjan, J. W. Sheaffer, Sang-Ha Lee and K. Skadron. +Rodinia: A Benchmark Suite for Heterogeneous Computing. IEEE International Symposium +on Workload Characterization, Oct 2009. + +- J. Meng and K. Skadron. "Performance Modeling and Automatic Ghost Zone Optimization +for Iterative Stencil Loops on GPUs." In Proceedings of the 23rd Annual ACM International +Conference on Supercomputing (ICS), June 2009. + +- L.G. Szafaryn, K. Skadron and J. Saucerman. "Experiences Accelerating MATLAB Systems +Biology Applications." in Workshop on Biomedicine in Computing (BiC) at the International +Symposium on Computer Architecture (ISCA), June 2009. + +- M. Boyer, D. Tarjan, S. T. Acton, and K. Skadron. "Accelerating Leukocyte Tracking using CUDA: +A Case Study in Leveraging Manycore Coprocessors." In Proceedings of the International Parallel +and Distributed Processing Symposium (IPDPS), May 2009. + +- S. Che, M. Boyer, J. Meng, D. Tarjan, J. W. Sheaffer, and K. Skadron. 
"A Performance +Study of General Purpose Applications on Graphics Processors using CUDA" Journal of +Parallel and Distributed Computing, Elsevier, June 2008. +-------------------------------------------------------------------------------- +6. Intel® Implicit SPMD Program Compiler (Intel® ISPC) - Renderkit samples + Copyright Intel Corporation + All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +-------------------------------------------------------------------------------- +7. 
Heat Transmission + +GNU LESSER GENERAL PUBLIC LICENSE +Version 3, 29 June 2007 + +Copyright © 2007 Free Software Foundation, Inc. + +Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. + +This version of the GNU Lesser General Public License incorporates the terms and conditions of version 3 of the GNU General Public License, supplemented by the additional permissions listed below. + +0. Additional Definitions. +As used herein, “this License” refers to version 3 of the GNU Lesser General Public License, and the “GNU GPL” refers to version 3 of the GNU General Public License. + +“The Library” refers to a covered work governed by this License, other than an Application or a Combined Work as defined below. + +An “Application” is any work that makes use of an interface provided by the Library, but which is not otherwise based on the Library. Defining a subclass of a class defined by the Library is deemed a mode of using an interface provided by the Library. + +A “Combined Work” is a work produced by combining or linking an Application with the Library. The particular version of the Library with which the Combined Work was made is also called the “Linked Version”. + +The “Minimal Corresponding Source” for a Combined Work means the Corresponding Source for the Combined Work, excluding any source code for portions of the Combined Work that, considered in isolation, are based on the Application, and not on the Linked Version. + +The “Corresponding Application Code” for a Combined Work means the object code and/or source code for the Application, including any data and utility programs needed for reproducing the Combined Work from the Application, but excluding the System Libraries of the Combined Work. + +1. Exception to Section 3 of the GNU GPL. +You may convey a covered work under sections 3 and 4 of this License without being bound by section 3 of the GNU GPL. + +2. Conveying Modified Versions. 
+If you modify a copy of the Library, and, in your modifications, a facility refers to a function or data to be supplied by an Application that uses the facility (other than as an argument passed when the facility is invoked), then you may convey a copy of the modified version: + +a) under this License, provided that you make a good faith effort to ensure that, in the event an Application does not supply the function or data, the facility still operates, and performs whatever part of its purpose remains meaningful, or +b) under the GNU GPL, with none of the additional permissions of this License applicable to that copy. +3. Object Code Incorporating Material from Library Header Files. +The object code form of an Application may incorporate material from a header file that is part of the Library. You may convey such object code under terms of your choice, provided that, if the incorporated material is not limited to numerical parameters, data structure layouts and accessors, or small macros, inline functions and templates (ten or fewer lines in length), you do both of the following: + +a) Give prominent notice with each copy of the object code that the Library is used in it and that the Library and its use are covered by this License. +b) Accompany the object code with a copy of the GNU GPL and this license document. +4. Combined Works. +You may convey a Combined Work under terms of your choice that, taken together, effectively do not restrict modification of the portions of the Library contained in the Combined Work and reverse engineering for debugging such modifications, if you also do each of the following: + +a) Give prominent notice with each copy of the Combined Work that the Library is used in it and that the Library and its use are covered by this License. +b) Accompany the Combined Work with a copy of the GNU GPL and this license document. 
+c) For a Combined Work that displays copyright notices during execution, include the copyright notice for the Library among these notices, as well as a reference directing the user to the copies of the GNU GPL and this license document. +d) Do one of the following: +0) Convey the Minimal Corresponding Source under the terms of this License, and the Corresponding Application Code in a form suitable for, and under terms that permit, the user to recombine or relink the Application with a modified version of the Linked Version to produce a modified Combined Work, in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source. +1) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (a) uses at run time a copy of the Library already present on the user's computer system, and (b) will operate properly with a modified version of the Library that is interface-compatible with the Linked Version. +e) Provide Installation Information, but only if you would otherwise be required to provide such information under section 6 of the GNU GPL, and only to the extent that such information is necessary to install and execute a modified version of the Combined Work produced by recombining or relinking the Application with a modified version of the Linked Version. (If you use option 4d0, the Installation Information must accompany the Minimal Corresponding Source and Corresponding Application Code. If you use option 4d1, you must provide the Installation Information in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source.) +5. Combined Libraries. 
+You may place library facilities that are a work based on the Library side by side in a single library together with other library facilities that are not Applications and are not covered by this License, and convey such a combined library under terms of your choice, if you do both of the following: + +a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities, conveyed under the terms of this License. +b) Give prominent notice with the combined library that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. +6. Revised Versions of the GNU Lesser General Public License. +The Free Software Foundation may publish revised and/or new versions of the GNU Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library as you received it specifies that a certain numbered version of the GNU Lesser General Public License “or any later version” applies to it, you have the option of following the terms and conditions either of that published version or of any later version published by the Free Software Foundation. If the Library as you received it does not specify a version number of the GNU Lesser General Public License, you may choose any version of the GNU Lesser General Public License ever published by the Free Software Foundation. + +If the Library as you received it specifies that a proxy can decide whether future versions of the GNU Lesser General Public License shall apply, that proxy's public statement of acceptance of any version is permanent authorization for you to choose that version for the Library. + +-------------------------------------------------------------------------------- +8. 
chart.js + Copyright (c) 2014-2021 Chart.js Contributors + + color + Copyright (c) 2018-2021 Jukka Kurkela + + Microsoft DirectX 11 Toolkit Engine Template: d3d11game_win32 + copyright 2015-2021 Microsoft Corp. + + Microsoft DirectX 11 Tutorial Wiki + + Nbody + (c) 2019 Fabio Baruffa + + Nothings/STB + Copyright (c) 2017 Sean Barrett + + Plotly.js + Copyright (c) 2020 Plotly, Inc + + pytracing + Copyright (c) 2015 Kris Wilson + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +-------------------------------------------------------------------------------- +9. Stream + +***NOTE: This is a modified version of Stream, hence section 3b of the license applies. + +* Copyright 1991-2003: John D. McCalpin +*----------------------------------------------------------------------- +* License: +* 1. You are free to use this program and/or to redistribute +* this program. +* 2.
You are free to modify this program for your own use, +* including commercial use, subject to the publication +* restrictions in item 3. +* 3. You are free to publish results obtained from running this +* program, or from works that you derive from this program, +* with the following limitations: +* 3a. In order to be referred to as "STREAM benchmark results", +* published results must be in conformance to the STREAM +* Run Rules, (briefly reviewed below) published at +* http://www.cs.virginia.edu/stream/ref.html +* and incorporated herein by reference. +* As the copyright holder, John McCalpin retains the +* right to determine conformity with the Run Rules. +* 3b. Results based on modified source code or on runs not in +* accordance with the STREAM Run Rules must be clearly +* labelled whenever they are published. Examples of +* proper labelling include: +* "tuned STREAM benchmark results" +* "based on a variant of the STREAM benchmark code" +* Other comparable, clear and reasonable labelling is +* acceptable. +* 3c. Submission of results to the STREAM benchmark web site +* is encouraged, but not required. +* 4. Use of this program or creation of derived works based on this +* program constitutes acceptance of these licensing restrictions. +* 5. Absolutely no warranty is expressed or implied. + +-------------------------------------------------------------------------------- +10. FPGA example designs-gzip + + SDL2.0 + +zlib License + + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software.
If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + +-------------------------------------------------------------------------------- +The following third party programs have their own third party program files as well. These additional third party program files are as follows: + +1. Intel® Implicit SPMD Program Compiler (Intel® ISPC)